xref: /aosp_15_r20/external/libdav1d/src/x86/filmgrain16_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1; Copyright © 2021, VideoLAN and dav1d authors
2; Copyright © 2021, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28%include "x86/filmgrain_common.asm"
29
; Read-only constants, 16-byte aligned. Notes below describe only the uses
; visible in this part of the file; several tables (pd_16, pw_16384, pw_8192,
; pw_23_22, pw_27_17_17_27, pb_1, max, min, pw_4, pw_16) are not referenced
; in the code shown here -- presumably consumed by later functions (TODO confirm).
30SECTION_RODATA 16
31pd_16: times 4 dd 16
32pw_1: times 8 dw 1
33pw_16384: times 8 dw 16384
34pw_8192: times 8 dw 8192
35pw_23_22: dw 23, 22
36          times 3 dw 0, 32
; pshufb lookup table used by the seed update: entry i is 0x80 when the 4-bit
; index i has an odd number of set bits, 0 otherwise.
37pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
38pw_27_17_17_27: dw 27, 17, 17, 27
39                times 2 dw 0, 32
; per-lane AND masks applied to the seed in the noise loops
40rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
; per-plane seed XOR values, indexed as [pw_seed_xor+uv*4]:
; 0xb524 for uv=0, 0x49d8 for uv=1
41pw_seed_xor: times 2 dw 0xb524
42             times 2 dw 0x49d8
43pb_1: times 4 db 1
; powers of two 1<<15 .. 1<<12; as pmulhuw multipliers they yield
; x>>1, x>>2, x>>3, x>>4. Also indexed by shift amounts in the chroma code.
44hmul_bits: dw 32768, 16384, 8192, 4096
; pmulhrsw multipliers for scaling the gaussian samples, indexed by
; (grain_scale_shift - bitdepth adjustment) with a -2 byte offset
45round: dw 2048, 1024, 512
; pmullw multipliers used by the seed update (the first four are loaded)
46mul_bits: dw 256, 128, 64, 32, 16
; round_vals[k] = 32<<k; addressed as [round_vals+shift*2-12], i.e. entry
; (ar_coeff_shift - 6), to obtain the auto-regression rounding constant
47round_vals: dw 32, 64, 128, 256, 512, 1024
48max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16
49min: dw 0, 16*4, 16*16
50; these two should be next to each other
51pw_4: times 2 dw 4
52pw_16: times 2 dw 16
53
; JMP_TABLE name, lag...
; Emits a table of 32-bit entries, one per trailing argument. Each entry is
; the distance from the start of the table to the local label `.ar<arg>` of
; the (mangled, private-prefixed) function `name`. The table itself becomes
; reachable under the symbol `<name>_table`. Users load an entry and add the
; table address back to obtain the jump target.
54%macro JMP_TABLE 1-*
55    %xdefine %1_table %%table
56    %xdefine %%base %1_table
57    %xdefine %%prefix mangle(private_prefix %+ _%1)
58    %%table:
    ; one dd per remaining argument; the rotate advances to the next argument
59    %rep %0 - 1
60        dd %%prefix %+ .ar%2 - %%base
61        %rotate 1
62    %endrep
63%endmacro
64
; Dispatch tables for the four grain generators: one entry per
; auto-regression label .ar0 to .ar3 of each function.
65JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3
66JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3
67JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3
68JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3
69
70SECTION .text
71
; PIC_ptr(a): address of constant `a`. On x86-32 it is expressed relative to
; `base` (each function defines `base` itself, so any earlier definition is
; dropped first); on x86-64 the symbol is used directly.
72%if ARCH_X86_32
73%undef base
74%define PIC_ptr(a) base+a
75%else
76%define PIC_ptr(a) a
77%endif
78
; m(x): mangled symbol name built from private_prefix, `x` and SUFFIX
79%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
80
; vpgatherdw: software gather of 16-bit words.
;   dst         xmm register receiving the gathered words
;   src         xmm register holding the 16-bit indices
;   base        address expression of the table being indexed
;   tmp_gpr x2  two scratch general-purpose registers (named without size suffix)
;   cnt         number of words to gather (default 8)
;   stride      scale applied to each index in the address (default 1)
;   tmp_xmm     scratch xmm, used when the macro is invoked with all eight
;               arguments; otherwise src doubles as scratch and is modified
; Two indices are consumed per unrolled step: one dword of indices is moved
; to a gpr, split into low and high word, and each word addresses one load.
81%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg
82%assign %%idx 0
83%define %%tmp %2
84%if %0 == 8
85%define %%tmp %8
86%endif
87%rep (%6/2)
88%if %%idx == 0
    ; first pair: read index dword 0 and stage index dword 1 in the low
    ; dword of the scratch register
89    movd        %5 %+ d, %2
90    pshuflw       %%tmp, %2, q3232
91%else
    ; later pairs: read the staged dword, then (8-word gathers only) stage
    ; the following one: the high qword at idx 2, its upper dword at idx 4
92    movd        %5 %+ d, %%tmp
93%if %6 == 8
94%if %%idx == 2
95    punpckhqdq    %%tmp, %%tmp
96%elif %%idx == 4
97    psrlq         %%tmp, 32
98%endif
99%endif
100%endif
    ; split the dword: first gpr = low index, second gpr = high index
101    movzx       %4 %+ d, %5 %+ w
102    shr         %5 %+ d, 16
103
104%if %%idx == 0
    ; movd loads 32 bits and clears the rest of dst; the upper word loaded
    ; here is replaced by the pinsrw into lane 1 below
105    movd             %1, [%3+%4*%7]
106%else
107    pinsrw           %1, [%3+%4*%7], %%idx + 0
108%endif
109    pinsrw           %1, [%3+%5*%7], %%idx + 1
110%assign %%idx %%idx+2
111%endrep
112%endmacro
113
; SPLATD dst, src: broadcast a 32-bit value to all four dwords of dst.
; src is first moved into dst with movd unless both operands are textually
; identical, in which case the low dword already in dst is broadcast.
114%macro SPLATD 2 ; dst, src
115%ifnidn %1, %2
116    movd %1, %2
117%endif
118    pshufd %1, %1, q0000
119%endmacro
120
; SPLATW dst, src: broadcast a 16-bit value to all eight words of dst.
; src is moved into dst with movd (a 32-bit move; only the low word is kept)
; unless both operands are textually identical.
121%macro SPLATW 2 ; dst, src
122%ifnidn %1, %2
123    movd %1, %2
124%endif
    ; replicate word 0 across the low qword, then copy the low qword upward
125    pshuflw %1, %1, q0000
126    punpcklqdq %1, %1
127%endmacro
128
129
; ---------------------------------------------------------------------------
; generate_grain_y_16bpc(buf, fg_data, bdmax), SSSE3
;
; Stage 1 (this part): fill the grain buffer at buf with scaled gaussian
; noise, four 16-bit values per iteration, covering 73*82 words.
; Stage 2: jump through generate_grain_y_16bpc_ssse3_table to the .ar0-.ar3
; label selected by FGData.ar_coeff_lag.
;
; All constants are addressed relative to r4 through `base` (r4 points at
; pb_mask on x86-64 and at the section start on x86-32).
; ---------------------------------------------------------------------------
130INIT_XMM ssse3
131%if ARCH_X86_64
132cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax
133    lea              r4, [pb_mask]
134%define base r4-pb_mask
135%else
136cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax
137    LEA              r4, $$
138%define base r4-$$
139%endif
    ; loop-invariant constants: m1 = seed bit masks, m4 = pmullw factors,
    ; m7 = pmulhuw factors (seed>>1 .. seed>>4)
140    movq             m1, [base+rnd_next_upperbit_mask]
141    movq             m4, [base+mul_bits]
142    movq             m7, [base+hmul_bits]
    ; m6 = pmulhrsw scaling factor, round[grain_scale_shift - r5 - 1]
143    mov             r3d, [fg_dataq+FGData.grain_scale_shift]
144    lea             r5d, [bdmaxq+1]
145    shr             r5d, 11             ; 0 for 10bpc, 2 for 12bpc
146    sub              r3, r5
147    SPLATW           m6, [base+round+r3*2-2]
148    mova             m5, [base+pb_mask]
149    SPLATW           m0, [fg_dataq+FGData.seed]
    ; r3 = negative byte count, bufq = end of the 73*82-word region, so
    ; [bufq+r3] walks the buffer upward while r3 counts toward zero.
    ; 73*82*2 bytes is not a multiple of the 8-byte step: the last iteration
    ; starts at offset -4 and stores 2 words beyond the region.
    ; NOTE(review): relies on the caller's buffer having that slack -- confirm.
150    mov              r3, -73*82*2
151    sub            bufq, r3
152%if ARCH_X86_64
153    lea              r6, [gaussian_sequence]
154%endif
155.loop:
    ; advance the random seed four steps at once (one new seed per word lane)
156    pand             m2, m0, m1
157    psrlw            m3, m2, 10
158    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
159    pmullw           m2, m4             ; bits 0x0f00 are set
160    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
161    psllq            m2, m3, 30
162    por              m2, m3
163    psllq            m3, m2, 15
164    por              m2, m3             ; aggregate each bit into next seed's high bit
165    pmulhuw          m3, m0, m7
166    por              m2, m3             ; 4 next output seeds
    ; carry the newest seed (word 3) into the next iteration, and turn the
    ; four seeds into 11-bit table indices
167    pshuflw          m0, m2, q3333
168    psrlw            m2, 5
    ; gather four words from gaussian_sequence (stride 2 bytes); m2 is used
    ; as scratch by the macro
169%if ARCH_X86_64
170    vpgatherdw       m3, m2, r6, r5, r7, 4, 2
171%else
172    vpgatherdw       m3, m2, base+gaussian_sequence, r5, r2, 4, 2
173%endif
174    paddw            m3, m3             ; otherwise bpc=12 w/ grain_scale_shift=0
175                                        ; shifts by 0, which pmulhrsw does not support
176    pmulhrsw         m3, m6
177    movq      [bufq+r3], m3
178    add              r3, 4*2
179    jl .loop
180
181    ; auto-regression code
    ; table entries are offsets relative to the table start; add the table
    ; address back and jump to .ar<ar_coeff_lag>
182    movsxd           r3, [fg_dataq+FGData.ar_coeff_lag]
183    movsxd           r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4]
184    lea              r3, [r3+base+generate_grain_y_16bpc_ssse3_table]
185    jmp              r3
186
; ---------------------------------------------------------------------------
; .ar1: lag-1 auto-regression over the luma grain buffer.
; For 70 rows and 76 columns per row, each value becomes
;   clamp(cur + ((cf0*topleft + cf1*top + cf2*topright + cf3*left + rnd)
;                >> ar_coeff_shift), min, max)
; where left is the value just written, max = bdmax>>1 and min = ~max.
; The three top taps plus rounding are computed for four pixels at a time
; with SIMD; the left tap forces the rest to run serially in gprs.
;
; bufq arrives pointing at the end of the 73*82-word region; after the
; adjustment below it points at row 3, column 79, and x runs from -76 to -1.
;
; Register names are remapped per ABI with DEFINE_ARGS. `sar` takes its
; variable count in cl, so `shift` presumably has to name the register that
; maps to rcx/ecx on each ABI (mapping lives in x86inc.asm -- confirm).
; ---------------------------------------------------------------------------
187.ar1:
188%if WIN64
189    DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0
    ; buf moves from r0 to its new register while being repositioned
190    lea            bufq, [r0-2*(82*73-(82*3+79))]
191    PUSH             r8
192%else
193%if ARCH_X86_64
194    DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
195%else ; x86-32
196    DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0
197    PUSH             r6
    ; shift is loaded into r1 (fg_data's register) once fg_data is done with
198%define shiftd r1d
199%endif
200    sub            bufq, 2*(82*73-(82*3+79))
201%endif
    ; cf3 = left-neighbour coefficient (scalar); m4 = coefficient bytes 0-3
202    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
203    movd             m4, [fg_dataq+FGData.ar_coeffs_y]
204    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    ; fg_data is no longer needed; its register is renamed (h on x86-64,
    ; shift on x86-32)
205%if WIN64
206    DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0
207%elif ARCH_X86_64
208    DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
209%else ; x86-32
210%undef shiftd
211    DEFINE_ARGS buf, shift, min, val3, x, cf3, val0
    ; x86-32: the row counter and max live in argument stack slots
    ; (slot 0 and the slot of argument 2, which held bdmax on entry)
212%define hd dword r0m
213%define maxd dword minm
214%endif
    ; sign-extend the coefficient bytes to words
215%if cpuflag(sse4)
216    pmovsxbw         m4, m4
217%else
218    pxor             m3, m3
219    pcmpgtb          m3, m4
220    punpcklbw        m4, m3
221%endif
    ; replace word 3 (cf3, handled in scalar code) by 1 so that the rounding
    ; constant can be added through pmaddwd:
    ; m4 = (cf0, cf1) pairs, m5 = (cf2, 1) pairs, m3 = rnd in every word
222    pinsrw           m4, [base+pw_1], 3
223    pshufd           m5, m4, q1111
224    pshufd           m4, m4, q0000
225    SPLATW           m3, [base+round_vals+shiftq*2-12]    ; rnd
226    mov              hd, 70
    ; max = bdmax >> 1, min = ~max
227    sar            maxd, 1
228    mov            mind, maxd
229    xor            mind, -1
230.y_loop_ar1:
231    mov              xq, -76
    ; seed the "left" value with the word just before the first processed column
232    movsx         val3d, word [bufq+xq*2-2]
233.x_loop_ar1:
    ; per group of 4 pixels: m0 = cf0*topleft + cf1*top + cf2*topright + rnd
234    movu             m0, [bufq+xq*2-82*2-2]     ; top/left
235    psrldq           m2, m0, 2                  ; top
236    psrldq           m1, m0, 4                  ; top/right
237    punpcklwd        m0, m2
238    punpcklwd        m1, m3
239    pmaddwd          m0, m4
240    pmaddwd          m1, m5
241    paddd            m0, m1
242.x_loop_ar1_inner:
    ; one pixel: take the next top sum from m0, add cf3*left, shift,
    ; add the current grain value, clamp, store
243    movd          val0d, m0
244    psrldq           m0, 4
245    imul          val3d, cf3d
246    add           val3d, val0d
247    sar           val3d, shiftb
248    movsx         val0d, word [bufq+xq*2]
249    add           val3d, val0d
250    cmp           val3d, maxd
251    cmovg         val3d, maxd
252    cmp           val3d, mind
253    cmovl         val3d, mind
254    mov word [bufq+xq*2], val3w
255    ; keep val3d in-place as left for next x iteration
256    inc              xq
257    jz .x_loop_ar1_end
    ; recompute the SIMD top sums every 4 pixels
258    test             xq, 3
259    jnz .x_loop_ar1_inner
260    jmp .x_loop_ar1
261
262.x_loop_ar1_end:
    ; next row (82 words per row)
263    add            bufq, 82*2
264    dec              hd
265    jg .y_loop_ar1
266%if WIN64
267    POP              r8
268%elif ARCH_X86_32
269    POP              r6
270%undef maxd
271%undef hd
272%endif
; .ar0: lag 0 -- no auto-regression for luma, return immediately
273.ar0:
274    RET
275
; ---------------------------------------------------------------------------
; .ar2: lag-2 auto-regression over the luma grain buffer (70 rows, 76
; columns per row, same buffer positioning as .ar1).
; Taps: 5 from row y-2 (cf0-4), 5 from row y-1 (cf5-9), 2 to the left in the
; current row (cf10-11). The ten top taps plus rounding are computed for four
; pixels at once; the two left taps are applied one pixel at a time because
; each result feeds the next pixel.
;
; Constant layout (registers on x86-64, 16-byte stack slots on x86-32):
;   m6=(cf0,cf1) m7=(cf2,cf3) m8=(cf4,cf5) m9=(cf6,cf7) m10=(cf8,cf9)
;   m11=(cf10,cf11) m12=rounding dwords m13=max_grain m14=min_grain
;   m15=word mask with every word set except word 1 (non-SSE4 builds only)
; ---------------------------------------------------------------------------
276.ar2:
277%if ARCH_X86_32
278    ALLOC_STACK -16*8
279%endif
280    DEFINE_ARGS buf, fg_data, bdmax, shift
281    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    ; m0 = rounding constant, widened to four dwords by the punpcklwd below
282    movd             m0, [base+round_vals-12+shiftq*2]
283    pshuflw          m0, m0, q0000
284    movu             m6, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-11
285    pxor             m2, m2
286    punpcklwd        m0, m2
    ; sign-extend coefficient bytes to words: m6 = bytes 0-7, m3 = bytes 8-15
287    pcmpgtb          m2, m6
288    punpckhbw        m3, m6, m2
289    punpcklbw        m6, m2
    ; broadcast each coefficient pair to all four dword lanes
290    pshufd           m2, m6, q3333
291    pshufd           m1, m6, q2222
292    pshufd           m7, m6, q1111
293    pshufd           m6, m6, q0000
294    pshufd           m4, m3, q1111
295    pshufd           m3, m3, q0000
296%if ARCH_X86_64
297    SWAP              0, 12
298    SWAP              1, 8
299    SWAP              2, 9
300    SWAP              3, 10
301    SWAP              4, 11
302%else
    ; x86-32 has only m0-m7: m8 and up are aliases for stack slots
303%define m12 [rsp+0*16]
304%define m8 [rsp+1*16]
305%define m9 [rsp+2*16]
306%define m10 [rsp+3*16]
307%define m11 [rsp+4*16]
308    mova            m12, m0
309    mova             m8, m1
310    mova             m9, m2
311    mova            m10, m3
312    mova            m11, m4
    ; reload bdmax from its stack slot (r2 served as gather scratch on x86-32)
313    mov          bdmaxd, bdmaxm
314%endif
315    sar          bdmaxd, 1
316    SPLATW           m0, bdmaxd                             ; max_grain
317    pcmpeqw          m1, m1
318%if !cpuflag(sse4)
    ; build the blend mask: isolate word 1, then invert (m1 is all-ones here)
319    pcmpeqw          m2, m2
320    psrldq           m2, 14
321    pslldq           m2, 2
322    pxor             m2, m1
323%endif
324    pxor             m1, m0                                 ; min_grain
325%if ARCH_X86_64
326    SWAP              0, 13
327    SWAP              1, 14
328    SWAP              2, 15
329%else
330%define m13 [rsp+5*16]
331%define m14 [rsp+6*16]
332    mova            m13, m0
333    mova            m14, m1
334%if !cpuflag(sse4)
335%define m15 [rsp+7*16]
336    mova            m15, m2
337%endif
338%endif
    ; bufq pointed at the end of the 73*82-word region; now row 3, column 79
339    sub            bufq, 2*(82*73-(82*3+79))
340    DEFINE_ARGS buf, fg_data, h, x
341    mov              hd, 70
342.y_loop_ar2:
343    mov              xq, -76
344
345.x_loop_ar2:
    ; top-row contribution for 4 pixels: pair neighbouring words so that each
    ; pmaddwd applies two coefficients per dword lane
346    movu             m0, [bufq+xq*2-82*4-4]     ; y=-2,x=[-2,+5]
347    movu             m1, [bufq+xq*2-82*2-4]     ; y=-1,x=[-2,+5]
348    psrldq           m2, m0, 2
349    psrldq           m3, m0, 4
350    psrldq           m4, m0, 6
351    psrldq           m5, m0, 8
352    punpcklwd        m0, m2                     ; y=-2, x-2/x-1
353    punpcklwd        m3, m4                     ; y=-2, x+0/x+1
354    punpcklwd        m5, m1                     ; y=-2 x+2 / y=-1 x-2
355    psrldq           m2, m1, 2
356    psrldq           m4, m1, 4
357    punpcklwd        m2, m4                     ; y=-1, x-1/x+0
358    psrldq           m4, m1, 6
359    psrldq           m1, 8
360    punpcklwd        m4, m1                     ; y=-1, x+1/x+2
361    pmaddwd          m0, m6
362    pmaddwd          m3, m7
363    pmaddwd          m5, m8
364    pmaddwd          m2, m9
365    pmaddwd          m4, m10
366    paddd            m0, m3
367    paddd            m5, m2
368    paddd            m0, m4
369    paddd            m0, m5                     ; accumulated top 2 rows
370    paddd            m0, m12                    ; plus rounding
371
372    movu             m1, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
    ; m4 = the four current grain values sign-extended to dwords
373    pshufd           m4, m1, q3321
374    pxor             m2, m2
375    pcmpgtw          m2, m4
376    punpcklwd        m4, m2                 ; in dwords, y=0,x=[0,3]
377.x_loop_ar2_inner:
    ; only dword lane 0 is meaningful here: cf10*x[-2] + cf11*x[-1] + top sum
378    pmaddwd          m2, m1, m11
379    paddd            m2, m0
380    psrldq           m0, 4                  ; shift top to next pixel
    ; shift count is read straight from the FGData field
381    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
382    paddd            m2, m4
383    packssdw         m2, m2
384    pminsw           m2, m13
385    pmaxsw           m2, m14
386    psrldq           m4, 4
    ; advance the current-row window by one word and drop the new value into
    ; word 1, where it acts as the left neighbour of the next pixel
387    pslldq           m2, 2
388    psrldq           m1, 2
389%if cpuflag(sse4)
390    pblendw          m1, m2, 00000010b
391%else
392    pand             m1, m15
393    pandn            m3, m15, m2
394    por              m1, m3
395%endif
396    ; overwrite previous pixel, this should be ok
397    movd  [bufq+xq*2-2], m1
398    inc              xq
399    jz .x_loop_ar2_end
    ; recompute the top-row sums every 4 pixels
400    test             xq, 3
401    jnz .x_loop_ar2_inner
402    jmp .x_loop_ar2
403
404.x_loop_ar2_end:
    ; next row (82 words per row)
405    add            bufq, 82*2
406    dec              hd
407    jg .y_loop_ar2
408%if ARCH_X86_32
409%undef m8
410%undef m9
411%undef m10
412%undef m11
413%undef m12
414%undef m13
415%undef m14
416%undef m15
417%endif
418    RET
419
; ---------------------------------------------------------------------------
; .ar3: lag-3 auto-regression over the luma grain buffer (70 rows, 76
; columns per row, same buffer positioning as .ar1/.ar2).
; Taps: 7 each from rows y-3, y-2, y-1 (cf0-20) and 3 to the left in the
; current row (cf21-23). The 21 top taps plus rounding are computed for four
; pixels at once; the left taps are applied one pixel at a time.
;
; Constant layout (registers on x86-64, stack slots on x86-32):
;   m5=(cf0,1) m6=(cf2,3) m7=(cf4,5) m8=(cf6,7) m9=(cf8,9) m10=(cf10,11)
;   m11=(cf12,13)
;   m12=(cf14,15) with SSE4; otherwise m12 is the blend mask (every word set
;       except word 2) and (cf14,15) lives at [tmp+48]
;   [tmp+0]=(cf16,17) [tmp+16]=(cf18,19) [tmp+32]=(cf20, rnd)
;   m13 = words cf21, cf22, cf23, round_vals[shift-5]
;   m14=min_grain m15=max_grain
; round_vals[k] = 32<<k, so entry shift-6 (rnd) is 1<<(shift-1) and entry
; shift-5 is 1<<shift: multiplying the current pixel by it and shifting right
; by `shift` adds the pixel back unchanged.
; ---------------------------------------------------------------------------
420.ar3:
421    DEFINE_ARGS buf, fg_data, bdmax, shift
    ; scratch area `tmp` for the constants that do not fit in registers
422%if WIN64
    ; keep the original rsp in r6, align, reserve 64 bytes
423    mov              r6, rsp
424    and             rsp, ~15
425    sub             rsp, 64
426    %define         tmp  rsp
427%elif ARCH_X86_64
    ; NOTE(review): addressed relative to rsp without an allocation in this
    ; block; presumably relies on x86inc stack bookkeeping -- confirm
428    %define         tmp  rsp+stack_offset-72
429%else
430    ALLOC_STACK  -16*12
431    %define         tmp  rsp
    ; reload bdmax from its stack slot (r2 served as gather scratch on x86-32)
432    mov          bdmaxd, bdmaxm
433%endif
434    sar          bdmaxd, 1
435    SPLATW           m7, bdmaxd                                 ; max_grain
436    pcmpeqw          m6, m6
437%if !cpuflag(sse4)
    ; build the blend mask: isolate word 2, then invert (m6 is all-ones here)
438    pcmpeqw          m4, m4
439    psrldq           m4, 14
440    pslldq           m4, 4
441    pxor             m4, m6
442%endif
443    pxor             m6, m7                                    ; min_grain
444    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
445
446%if ARCH_X86_64
447    SWAP              6, 14
448    SWAP              7, 15
449%else
450%define m14 [rsp+10*16]
451%define m15 [esp+11*16]
452    mova            m14, m6
453    mova            m15, m7
454%endif
455
456    ; build cf0-1 until 18-19 in m5-12 and r0/1
    ; sign-extend coefficient bytes to words: m0 = bytes 0-7, m2 = bytes 8-15
457    pxor             m1, m1
458    movu             m0, [fg_dataq+FGData.ar_coeffs_y+ 0]       ; cf0-15
459    pcmpgtb          m1, m0
460    punpckhbw        m2, m0, m1
461    punpcklbw        m0, m1
462
    ; (cf14,cf15): in m4 with SSE4; otherwise m4 holds the blend mask and the
    ; pair is parked at [tmp+48]
463%if cpuflag(sse4)
464    pshufd           m4, m2, q3333
465%else
466    pshufd           m5, m2, q3333
467    mova       [tmp+48], m5
468%endif
469    pshufd           m3, m2, q2222
470    pshufd           m1, m2, q0000
471    pshufd           m2, m2, q1111
472    pshufd           m7, m0, q2222
473    pshufd           m6, m0, q1111
474    pshufd           m5, m0, q0000
475    pshufd           m0, m0, q3333
476
477%if ARCH_X86_64
478    SWAP              0, 8
479    SWAP              1, 9
480    SWAP              2, 10
481    SWAP              3, 11
482    SWAP              4, 12
483%else
    ; x86-32 has only m0-m7: m8 and up are aliases for stack slots
484%define m8 [rsp+4*16]
485%define m9 [esp+5*16]
486%define m10 [rsp+6*16]
487%define m11 [esp+7*16]
488%define m12 [rsp+8*16]
489    mova             m8, m0
490    mova             m9, m1
491    mova            m10, m2
492    mova            m11, m3
493    mova            m12, m4
494%endif
495
496    ; build cf20,round in r2
497    ; build cf21-23,round*2 in m13
498    pxor             m1, m1
499    movq             m0, [fg_dataq+FGData.ar_coeffs_y+16]       ; cf16-23
500    pcmpgtb          m1, m0
501    punpcklbw        m0, m1
502    pshufd           m1, m0, q0000
503    pshufd           m2, m0, q1111
504    mova       [tmp+ 0], m1
505    mova       [tmp+16], m2
    ; m3 = words cf21, cf22, cf23, then round_vals[shift-5] inserted as word 3
506    psrldq           m3, m0, 10
507    pinsrw           m3, [base+round_vals+shiftq*2-10], 3
508
509%if ARCH_X86_64
510    SWAP              3, 13
511%else
512%define m13 [esp+9*16]
513    mova            m13, m3
514%endif
515
    ; word 5 = rounding constant, so dword 2 = (cf20, rnd)
516    pinsrw           m0, [base+round_vals+shiftq*2-12], 5
517    pshufd           m3, m0, q2222
518    mova       [tmp+32], m3
519
520    DEFINE_ARGS buf, fg_data, h, x
    ; bufq pointed at the end of the 73*82-word region; now row 3, column 79
521    sub            bufq, 2*(82*73-(82*3+79))
522    mov              hd, 70
523.y_loop_ar3:
524    mov              xq, -76
525
526.x_loop_ar3:
527    movu             m0, [bufq+xq*2-82*6-6+ 0]      ; y=-3,x=[-3,+4]
528    movd             m1, [bufq+xq*2-82*6-6+16]      ; y=-3,x=[+5,+6]
529    palignr          m2, m1, m0, 2                  ; y=-3,x=[-2,+5]
530    palignr          m1, m1, m0, 12                 ; y=-3,x=[+3,+6]
531    punpckhwd        m3, m0, m2                     ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
532    punpcklwd        m0, m2                         ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
533    shufps           m2, m0, m3, q1032              ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
534
535    pmaddwd          m0, m5
536    pmaddwd          m2, m6
537    pmaddwd          m3, m7
538    paddd            m0, m2
539    paddd            m0, m3
540    ; m0 = top line first 6 multiplied by cf, m1 = top line last entry
541
542    movu             m2, [bufq+xq*2-82*4-6+ 0]      ; y=-2,x=[-3,+4]
543    movd             m3, [bufq+xq*2-82*4-6+16]      ; y=-2,x=[+5,+6]
544    punpcklwd        m1, m2                         ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
545    palignr          m4, m3, m2, 2                  ; y=-2,x=[-2,+5]
546    palignr          m3, m3, m2, 4                  ; y=-2,x=[-1,+6]
547    punpckhwd        m2, m4, m3                     ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
548    punpcklwd        m4, m3                         ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
549    shufps           m3, m4, m2, q1032              ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
550
551    pmaddwd          m1, m8
552    pmaddwd          m4, m9
553    pmaddwd          m3, m10
554    pmaddwd          m2, m11
555    paddd            m1, m4
556    paddd            m3, m2
557    paddd            m0, m1
558    paddd            m0, m3
559    ; m0 = top 2 lines multiplied by cf
560
561    movu             m1, [bufq+xq*2-82*2-6+ 0]      ; y=-1,x=[-3,+4]
562    movd             m2, [bufq+xq*2-82*2-6+16]      ; y=-1,x=[+5,+6]
563    palignr          m3, m2, m1, 2                  ; y=-1,x=[-2,+5]
564    palignr          m2, m2, m1, 12                 ; y=-1,x=[+3,+6]
565    punpckhwd        m4, m1, m3                     ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
566    punpcklwd        m1, m3                         ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
567    shufps           m3, m1, m4, q1032              ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    ; pair x+3 with the constant 1 so that (cf20, rnd) adds the rounding term
568    punpcklwd        m2, [base+pw_1]
569
570%if cpuflag(sse4)
571    pmaddwd          m1, m12
572%else
573    pmaddwd          m1, [tmp+48]
574%endif
575    pmaddwd          m3, [tmp+ 0]
576    pmaddwd          m4, [tmp+16]
577    pmaddwd          m2, [tmp+32]
578    paddd            m1, m3
579    paddd            m4, m2
580    paddd            m0, m1
581    paddd            m0, m4
582    ; m0 = top 3 lines multiplied by cf plus rounding for downshift
583
584    movu             m1, [bufq+xq*2-6]      ; y=0,x=[-3,+4]
585.x_loop_ar3_inner:
    ; dword 0 = cf21*x[-3] + cf22*x[-2]; dword 1 = cf23*x[-1] + cur<<shift;
    ; only dword lane 0 of the sum is meaningful
586    pmaddwd          m2, m1, m13
587    pshufd           m3, m2, q1111
588    paddd            m2, m3                 ; left+cur
589    paddd            m2, m0                 ; add top
590    psrldq           m0, 4
    ; shift count is read straight from the FGData field
591    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
592    packssdw         m2, m2
593    pminsw           m2, m15
594    pmaxsw           m2, m14
    ; advance the current-row window by one word and drop the new value into
    ; word 2, where it acts as the nearest left neighbour of the next pixel
595    pslldq           m2, 4
596    psrldq           m1, 2
597%if cpuflag(sse4)
598    pblendw          m1, m2, 00000100b
599%else
600    pand             m1, m12
601    pandn            m3, m12, m2
602    por              m1, m3
603%endif
604    ; overwrite a couple of pixels, should be ok
605    movq  [bufq+xq*2-4], m1
606    inc              xq
607    jz .x_loop_ar3_end
    ; recompute the top-row sums every 4 pixels
608    test             xq, 3
609    jnz .x_loop_ar3_inner
610    jmp .x_loop_ar3
611
612.x_loop_ar3_end:
    ; next row (82 words per row)
613    add            bufq, 82*2
614    dec              hd
615    jg .y_loop_ar3
616%if WIN64
    ; undo the manual stack alignment done on entry to .ar3
617    mov             rsp, r6
618%elif ARCH_X86_32
619%undef m8
620%undef m9
621%undef m10
622%undef m11
623%undef m12
624%undef m13
625%undef m14
626%undef m15
627%endif
628    RET
629
630%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
631INIT_XMM ssse3
632%if ARCH_X86_64
633cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg
634%define base r8-pb_mask
635    lea              r8, [pb_mask]
636    movifnidn    bdmaxd, bdmaxm
637    lea             r6d, [bdmaxq+1]
638%else
639cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
640%define base r2-$$
641    LEA              r2, $$
642    mov        fg_dataq, r2m
643    mov             r6d, r4m
644    inc             r6d
645%endif
646    movq             m1, [base+rnd_next_upperbit_mask]
647    movq             m4, [base+mul_bits]
648    movq             m7, [base+hmul_bits]
649    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
650    shr             r6d, 11             ; 0 for 10bpc, 2 for 12bpc
651    sub              r5, r6
652    SPLATW           m6, [base+round+r5*2-2]
653    mova             m5, [base+pb_mask]
654    SPLATW           m0, [fg_dataq+FGData.seed]
655%if ARCH_X86_64
656    SPLATW           m2, [base+pw_seed_xor+uvq*4]
657%else
658    mov             r5d, r3m
659    SPLATW           m2, [base+pw_seed_xor+r5*4]
660%endif
661    pxor             m0, m2
662%if ARCH_X86_64
663    lea              r6, [gaussian_sequence]
664%endif
665%if %2
666    mov              hd, 73-35*%3
667    add            bufq, 44*2
668.loop_y:
669    mov              xq, -44
670%else
671    mov              xq, -82*73
672    add            bufq, 82*73*2
673%endif
674.loop_x:
675    pand             m2, m0, m1
676    psrlw            m3, m2, 10
677    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
678    pmullw           m2, m4             ; bits 0x0f00 are set
679    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
680    psllq            m2, m3, 30
681    por              m2, m3
682    psllq            m3, m2, 15
683    por              m2, m3             ; aggregate each bit into next seed's high bit
684    pmulhuw          m3, m0, m7
685    por              m2, m3             ; 4 next output seeds
686    pshuflw          m0, m2, q3333
687    psrlw            m2, 5
688%if ARCH_X86_64
689    vpgatherdw       m3, m2, r6, r9, r10, 4, 2
690%else
691    vpgatherdw       m3, m2, base+gaussian_sequence, r5, r6, 4, 2
692%endif
693    paddw            m3, m3             ; otherwise bpc=12 w/ grain_scale_shift=0
694                                        ; shifts by 0, which pmulhrsw does not support
695    pmulhrsw         m3, m6
696    movq    [bufq+xq*2], m3
697    add              xq, 4
698    jl .loop_x
699%if %2
700    add            bufq, 82*2
701    dec              hd
702    jg .loop_y
703%endif
704
705    ; auto-regression code
706    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
707    movsxd           r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4]
708    lea              r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table]
709    jmp              r5
710
711.ar0:
712%if ARCH_X86_64
713    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
714%else
715    DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
716    ALLOC_STACK  -16*2
717    mov           bufyq, r1m
718    mov             uvd, r3m
719%endif
720    imul            uvd, 28
721    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
722    movd             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
723    SPLATW           m3, [base+hmul_bits+shiftq*2-10]
724%if ARCH_X86_64
725    sar          bdmaxd, 1
726    SPLATW           m1, bdmaxd                     ; max_gain
727%else
728    SPLATW           m1, r4m
729    psraw            m1, 1
730%endif
731    pcmpeqw          m7, m7
732    pxor             m7, m1                         ; min_grain
733%if ARCH_X86_64
734    SWAP              1, 14
735    DEFINE_ARGS buf, bufy, h, x
736%else
737%define m14 [rsp+0*16]
738    mova            m14, m1
739    DEFINE_ARGS buf, bufy, pic_reg, h, x
740%endif
741    pxor             m5, m5
742    pcmpgtb          m5, m4
743    punpcklbw        m4, m5
744%if %2
745    SPLATW           m6, [base+hmul_bits+2+%3*2]
746%endif
747    SPLATW           m4, m4
748    pxor             m5, m5
749%if %2
750%if !cpuflag(sse4)
751    pcmpeqw          m2, m2
752    pslldq           m2, 12
753%if ARCH_X86_64
754    SWAP              2, 12
755%else
756%define m12 [rsp+1*16]
757    mova            m12, m2
758%endif
759%endif
760%endif
761%if %2
762    sub            bufq, 2*(82*(73-35*%3)+82-(82*3+41))
763%else
764    sub            bufq, 2*(82*70-3)
765%endif
766    add           bufyq, 2*(3+82*3)
767    mov              hd, 70-35*%3
768.y_loop_ar0:
769    ; first 32 pixels
770    xor              xd, xd
771.x_loop_ar0:
772    movu             m0, [bufyq+xq*(2<<%2)]
773%if %2
774%if %3
775    movu             m2, [bufyq+xq*4+82*2]
776    paddw            m0, m2
777%endif
778    movu             m1, [bufyq+xq*4     +16]
779%if %3
780    movu             m2, [bufyq+xq*4+82*2+16]
781    paddw            m1, m2
782%endif
783    phaddw           m0, m1
784    pmulhrsw         m0, m6
785%endif
786    punpckhwd        m1, m0, m5
787    punpcklwd        m0, m5
788    REPX {pmaddwd x, m4}, m0, m1
789    REPX {psrad x, 5}, m0, m1
790    packssdw         m0, m1
791    pmulhrsw         m0, m3
792    movu             m1, [bufq+xq*2]
793    paddw            m0, m1
794    pminsw           m0, m14
795    pmaxsw           m0, m7
796    cmp              xd, 72-40*%2
797    je .end
798    movu    [bufq+xq*2], m0
799    add              xd, 8
800    jmp .x_loop_ar0
801
802    ; last 6/4 pixels
803.end:
804%if %2
805%if cpuflag(sse4)
806    pblendw          m0, m1, 11000000b
807%else
808    pand             m1, m12
809    pandn            m2, m12, m0
810    por              m0, m1, m2
811%endif
812    movu    [bufq+xq*2], m0
813%else
814    movq    [bufq+xq*2], m0
815%endif
816
817    add            bufq, 82*2
818    add           bufyq, 82*(2<<%3)
819    dec              hd
820    jg .y_loop_ar0
821%if ARCH_X86_32
822%undef m12
823%undef m14
824%endif
825    RET
826
827.ar1:
828%if ARCH_X86_64
829    DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x
830%else
831    RESET_STACK_STATE
832    DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3
833    mov           bufyq, r1m
834    mov             uvd, r3m
835%endif
836    imul            uvd, 28
837    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
838    movq             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
839%if WIN64
840    DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0
841%if %2
842    lea            bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))]
843%else
844    lea            bufq, [r0-2*(82*69+3)]
845%endif
846%else
847%if ARCH_X86_64
848    DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0
849%else
850    DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3
851%define hd dword r1m
852%define mind dword r3m
853%define maxd dword r4m
854%endif
855%if %2
856    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
857%else
858    sub            bufq, 2*(82*69+3)
859%endif
860%endif
861%if ARCH_X86_64
862    mov          shiftd, [r2+FGData.ar_coeff_shift]
863%else
864    mov          shiftd, [r3+FGData.ar_coeff_shift]
865%endif
866    pxor             m5, m5
867    pcmpgtb          m5, m4
868    punpcklbw        m4, m5                 ; cf0-4 in words
869    pshuflw          m4, m4, q2100
870    psrldq           m4, 2                  ; cf0-3,4 in words
871    pshufd           m5, m4, q1111
872    pshufd           m4, m4, q0000
873    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
874    pxor             m6, m6
875    punpcklwd        m3, m6
876%if %2
877    SPLATW           m6, [base+hmul_bits+2+%3*2]
878%endif
879    SPLATD           m3, m3
880    add           bufyq, 2*(79+82*3)
881    mov              hd, 70-35*%3
882    sar            maxd, 1
883%if ARCH_X86_64
884    mov            mind, maxd
885    xor            mind, -1
886%else
887    DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3
888    mov              r2, maxd
889    xor              r2, -1
890    mov            mind, r2
891%endif
892.y_loop_ar1:
893    mov              xq, -(76>>%2)
894    movsx         val3d, word [bufq+xq*2-2]
895.x_loop_ar1:
896    movu             m0, [bufq+xq*2-82*2-2] ; top/left
897%if %2
898    movu             m7, [bufyq+xq*4]
899%if %3
900    movu             m1, [bufyq+xq*4+82*2]
901    phaddw           m7, m1
902%else
903    phaddw           m7, m7
904%endif
905%else
906    movq             m7, [bufyq+xq*2]
907%endif
908    psrldq           m2, m0, 2              ; top
909    psrldq           m1, m0, 4              ; top/right
910    punpcklwd        m0, m2
911%if %2
912%if %3
913    pshufd           m2, m7, q3232
914    paddw            m7, m2
915%endif
916    pmulhrsw         m7, m6
917%endif
918    punpcklwd        m1, m7
919    pmaddwd          m0, m4
920    pmaddwd          m1, m5
921    paddd            m0, m1
922    paddd            m0, m3
923.x_loop_ar1_inner:
924    movd          val0d, m0
925    psrldq           m0, 4
926    imul          val3d, cf3d
927    add           val3d, val0d
928    sar           val3d, shiftb
929    movsx         val0d, word [bufq+xq*2]
930    add           val3d, val0d
931    cmp           val3d, maxd
932    cmovg         val3d, maxd
933    cmp           val3d, mind
934    cmovl         val3d, mind
935    mov word [bufq+xq*2], val3w
936    ; keep val3d in-place as left for next x iteration
937    inc              xq
938    jz .x_loop_ar1_end
939    test             xq, 3
940    jnz .x_loop_ar1_inner
941    jmp .x_loop_ar1
942
943.x_loop_ar1_end:
944    add            bufq, 82*2
945    add           bufyq, 82*2<<%3
946    dec              hd
947    jg .y_loop_ar1
948%if ARCH_X86_32
949%undef maxd
950%undef mind
951%undef hd
952%endif
953    RET
954
; .ar2 -- chroma grain auto-regression, lag 2.
; Per output pixel the filter sums (taps as annotated in the loop below):
;   rows y=-2 and y=-1 at x=[-2,+2], row y=0 at x=-2 and x=-1, one luma
;   sample, and a rounding constant. The sum is shifted right by
;   ar_coeff_shift, added to the value already stored in buf at that position
;   and clamped to [min_grain, max_grain].
; Macro parameters as used in this section:
;   %2 != 0: two horizontally adjacent luma samples are combined per pixel and
;            the row width is 76>>%2 instead of 76
;   %3 != 0: the following luma row is added as well, 70-35*%3 rows are
;            processed and bufy advances two rows per chroma row
955.ar2:
956%if ARCH_X86_64
957    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
958%else
959    DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
    ; x86-32: 8 xmm-sized stack slots, used further down as m8-m15
960    ALLOC_STACK  -16*8
961    mov           bufyq, r1m
962    mov             uvd, r3m
963%endif
964    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
965    imul            uvd, 28                    ; byte offset added to ar_coeffs_uv below
966%if ARCH_X86_64
967    sar          bdmaxd, 1
968    SPLATW           m5, bdmaxd                 ; max_grain
969%else
970    SPLATW           m5, r4m
971    psraw            m5, 1
972%endif
973    pcmpeqw          m6, m6                     ; all ones
974%if !cpuflag(sse4)
    ; no pblendw available: build a mask that is all ones except in word 1,
    ; used by the pand/pandn/por sequence in the inner loop
975    pcmpeqw          m7, m7
976    psrldq           m7, 14
977    pslldq           m7, 2
978    pxor             m7, m6
979%endif
980    pxor             m6, m5                    ; min_grain (= ~max_grain)
981%if %2 && cpuflag(sse4)
    ; pmulhrsw factor applied to the summed luma samples (the non-SSE4 path
    ; uses pw_8192/pw_16384 directly because m15 holds the blend mask there)
982    SPLATW           m7, [base+hmul_bits+2+%3*2]
983%endif
984
    ; park max_grain / min_grain / (mask or luma factor) in m13 / m14 / m15
985%if ARCH_X86_64
986    SWAP              5, 13
987    SWAP              6, 14
988    SWAP              7, 15
989%else
990%define m13 [rsp+5*16]
991%define m14 [rsp+6*16]
992%define m15 [rsp+7*16]
993    mova            m13, m5
994    mova            m14, m6
995    mova            m15, m7
996%endif
997
998    ; coef values
    ; sign-extend 16 int8 coefficients to words: m0 = words 0-7, m2 = words 8-15
999    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]
1000    pxor             m1, m1
1001    pcmpgtb          m1, m0
1002    punpckhbw        m2, m0, m1
1003    punpcklbw        m0, m1
    ; word 13 is replaced by a rounding value; together with word 12 it forms
    ; the pair that is multiplied with (luma, 1) in the loop
1004    pinsrw           m2, [base+round_vals-12+shiftq*2], 5
1005
    ; broadcast one coefficient pair to every dword of each register
1006    pshufd           m6, m0, q0000
1007    pshufd           m7, m0, q1111
1008    pshufd           m1, m0, q3333
1009    pshufd           m0, m0, q2222
1010    pshufd           m3, m2, q1111
1011    pshufd           m4, m2, q2222
1012    pshufd           m2, m2, q0000
1013
1014%if ARCH_X86_64
1015    SWAP              0, 8
1016    SWAP              1, 9
1017    SWAP              2, 10
1018    SWAP              3, 11
1019    SWAP              4, 12
1020%else
1021%define m8 [rsp+0*16]
1022%define m9 [rsp+1*16]
1023%define m10 [rsp+2*16]
1024%define m11 [rsp+3*16]
1025%define m12 [rsp+4*16]
1026    mova             m8, m0
1027    mova             m9, m1
1028    mova            m10, m2
1029    mova            m11, m3
1030    mova            m12, m4
1031%endif
1032
1033%if ARCH_X86_64
1034    DEFINE_ARGS buf, bufy, fg_data, h, x
1035%else
1036    DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
1037%endif
    ; NOTE(review): the pointer adjustments below depend on where the code
    ; before this section left bufq/bufyq -- confirm against the macro prologue
1038%if %2
1039    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
1040%else
1041    sub            bufq, 2*(82*69+3)
1042%endif
1043    add           bufyq, 2*(79+82*3)
1044    mov              hd, 70-35*%3               ; number of rows to generate
1045.y_loop_ar2:
1046    mov              xq, -(76>>%2)              ; negative column index, counts up to 0
1047
    ; Each pass computes the contribution of the two rows above and of luma for
    ; up to 4 consecutive pixels (one per dword of m0); the inner loop then
    ; finishes those pixels one at a time, since each needs its left neighbours.
1048.x_loop_ar2:
1049    movu             m0, [bufq+xq*2-82*4-4]     ; y=-2,x=[-2,+5]
1050    movu             m5, [bufq+xq*2-82*2-4]     ; y=-1,x=[-2,+5]
1051    psrldq           m4, m0, 2                  ; y=-2,x=[-1,+5]
1052    psrldq           m1, m0, 4                  ; y=-2,x=[-0,+5]
1053    psrldq           m3, m0, 6                  ; y=-2,x=[+1,+5]
1054    psrldq           m2, m0, 8                  ; y=-2,x=[+2,+5]
1055    punpcklwd        m0, m4                     ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
1056    punpcklwd        m1, m3                     ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
1057    punpcklwd        m2, m5                     ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1]
1058    pmaddwd          m0, m6
1059    pmaddwd          m1, m7
1060    pmaddwd          m2, m8
1061    paddd            m0, m1
1062    paddd            m0, m2
1063    psrldq           m3, m5, 2                  ; y=-1,x=[-1,+5]
1064    psrldq           m1, m5, 4                  ; y=-1,x=[-0,+5]
1065    psrldq           m4, m5, 6                  ; y=-1,x=[+1,+5]
1066    psrldq           m2, m5, 8                  ; y=-1,x=[+2,+5]
1067    punpcklwd        m3, m1
1068    punpcklwd        m4, m2
1069    pmaddwd          m3, m9
1070    pmaddwd          m4, m10
1071    paddd            m3, m4
1072    paddd            m0, m3
1073
1074    ; luma component & rounding
1075%if %2
1076    movu             m1, [bufyq+xq*4]
1077%if %3
    ; sum each horizontal luma pair in both rows, then add the two rows
1078    movu             m2, [bufyq+xq*4+82*2]
1079    phaddw           m1, m2
1080    pshufd           m2, m1, q3232
1081    paddw            m1, m2
1082%else
1083    phaddw           m1, m1
1084%endif
    ; scale the summed luma samples with rounding
1085%if cpuflag(sse4)
1086    pmulhrsw         m1, m15
1087%elif %3
1088    pmulhrsw         m1, [base+pw_8192]
1089%else
1090    pmulhrsw         m1, [base+pw_16384]
1091%endif
1092%else
1093    movq             m1, [bufyq+xq*2]
1094%endif
    ; (luma, 1) * (luma coefficient, rounding value)
1095    punpcklwd        m1, [base+pw_1]
1096    pmaddwd          m1, m12
1097    paddd            m0, m1
1098
1099    movu             m1, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
1100    pshufd           m2, m1, q3321
1101    pxor             m3, m3
1102    pcmpgtw          m3, m2
1103    punpcklwd        m2, m3                 ; y=0,x=[0,3] in dword
1104.x_loop_ar2_inner:
1105    pmaddwd          m3, m1, m11            ; dword 0 = taps at y=0,x=-2/-1
1106    paddd            m3, m0
1107    psrldq           m0, 4                  ; shift top to next pixel
1108    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
1109    ; only the lowest lane (the pixel at the current x) is kept by the blend below
1110    paddd            m3, m2
1111    packssdw         m3, m3
1112    pminsw           m3, m13
1113    pmaxsw           m3, m14
1114    psrldq           m1, 2
1115    pslldq           m3, 2
1116    psrldq           m2, 4
    ; insert the new pixel into word 1 of the shifted current-row register
1117%if cpuflag(sse4)
1118    pblendw          m1, m3, 00000010b
1119%else
1120    pand             m1, m15
1121    pandn            m4, m15, m3
1122    por              m1, m4
1123%endif
1124    ; overwrite previous pixel, should be ok
1125    movd  [bufq+xq*2-2], m1
1126    inc              xq
1127    jz .x_loop_ar2_end
1128    test             xq, 3
1129    jnz .x_loop_ar2_inner
1130    jmp .x_loop_ar2
1131
1132.x_loop_ar2_end:
1133    add            bufq, 82*2
1134    add           bufyq, 82*2<<%3
1135    dec              hd
1136    jg .y_loop_ar2
1137%if ARCH_X86_32
1138%undef m13
1139%undef m14
1140%undef m15
1141%endif
1142    RET
1143
; .ar3 -- chroma grain auto-regression, lag 3.
; Per output pixel the filter sums (taps as annotated in the loop below):
;   rows y=-3, y=-2 and y=-1 at x=[-3,+3], row y=0 at x=[-3,-1], one luma
;   sample, and a rounding constant. The current value of the pixel is folded
;   into the same sum via a "1.0" weight; the result is shifted right by
;   ar_coeff_shift and clamped to [min_grain, max_grain].
; %2 / %3 select the luma handling and loop extents exactly as in .ar2.
; The first six coefficient pairs live in memory at tmp+16*0..5; the remaining
; constants are kept in m6-m15 (stack slots on x86-32).
1144.ar3:
1145%if ARCH_X86_64
1146    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
1147%if WIN64
    ; keep the original rsp in r6, then carve out 96 aligned bytes (6 xmm slots)
1148    mov              r6, rsp
1149    and             rsp, ~15
1150    sub             rsp, 96
1151    %define         tmp  rsp
1152%else
    ; NOTE(review): this addresses memory below the entry stack pointer
    ; (presumably the SysV red zone) -- confirm
1153    %define         tmp  rsp+stack_offset-120
1154%endif
1155%else
1156    DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
    ; x86-32: 14 xmm-sized stack slots: 0-5 = tmp, 6-13 = m8-m15
1157    ALLOC_STACK  -16*14
1158    mov           bufyq, r1m
1159    mov             uvd, r3m
1160    %define         tmp  rsp
1161%endif
1162    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
1163    imul            uvd, 28                    ; byte offset added to ar_coeffs_uv below
    ; rounding value, sign-extended to dwords (added to the sum as "+= round")
1164    SPLATW           m4, [base+round_vals-12+shiftq*2]
1165    pxor             m5, m5
1166    pcmpgtw          m5, m4
1167    punpcklwd        m4, m5
1168%if ARCH_X86_64
1169    sar          bdmaxd, 1
1170    SPLATW           m6, bdmaxd                 ; max_grain
1171%else
1172    SPLATW           m6, r4m
1173    psraw            m6, 1
1174%endif
1175    pcmpeqw          m7, m7                     ; all ones
1176%if !cpuflag(sse4)
    ; no pblendw available: build a mask that is all ones except in word 2,
    ; used by the pand/pandn/por sequence in the inner loop
1177    pcmpeqw          m3, m3
1178    psrldq           m3, 14
1179    pslldq           m3, 4
1180    pxor             m3, m7
1181%endif
1182    pxor             m7, m6                     ; min_grain (= ~max_grain)
1183%if %2 && cpuflag(sse4)
    ; pmulhrsw factor applied to the summed luma samples (the non-SSE4 path
    ; uses pw_8192/pw_16384 directly because m11 holds the blend mask there)
1184    SPLATW           m3, [base+hmul_bits+2+%3*2]
1185%endif
1186
    ; park (mask or luma factor) / round / max_grain / min_grain in m11 / m12 / m14 / m15
1187%if ARCH_X86_64
1188    SWAP              3, 11
1189    SWAP              4, 12
1190    SWAP              6, 14
1191    SWAP              7, 15
1192%else
1193%define m11 [rsp+ 9*16]
1194%define m12 [rsp+10*16]
1195%define m14 [rsp+12*16]
1196%define m15 [rsp+13*16]
1197    mova            m11, m3
1198    mova            m12, m4
1199    mova            m14, m6
1200    mova            m15, m7
1201%endif
1202
1203    ; cf from y=-3,x=-3 until y=-3,x=-2
    ; sign-extend coefficients 0-15 to words and broadcast them pair by pair:
    ; pairs 0-5 go to tmp, pairs 6 and 7 stay in m6 and m7
1204    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
1205    pxor             m1, m1
1206    pcmpgtb          m1, m0
1207    punpckhbw        m2, m0, m1
1208    punpcklbw        m0, m1
1209    pshufd           m1, m0, q0000
1210    pshufd           m3, m0, q1111
1211    pshufd           m4, m0, q2222
1212    pshufd           m0, m0, q3333
1213    pshufd           m5, m2, q0000
1214    pshufd           m6, m2, q1111
1215    mova     [tmp+16*0], m1
1216    mova     [tmp+16*1], m3
1217    mova     [tmp+16*2], m4
1218    mova     [tmp+16*3], m0
1219    mova     [tmp+16*4], m5
1220    mova     [tmp+16*5], m6
1221    pshufd           m6, m2, q2222
1222    pshufd           m7, m2, q3333
1223
1224    ; cf from y=-1,x=-1 to y=0,x=-1 + luma component
1225    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]
1226    pxor             m1, m1
1227    pcmpgtb          m1, m0
1228    punpckhbw        m2, m0, m1                 ; luma
1229    punpcklbw        m0, m1
1230    pshufd           m3, m0, q3232
1231    psrldq           m5, m0, 10
1232    ; y=0,x=[-3 to -1] + "1.0" for current pixel
1233    pinsrw           m5, [base+round_vals-10+shiftq*2], 3
1234    ; y=-1,x=[-1 to +2]
1235    pshufd           m1, m0, q0000
1236    pshufd           m0, m0, q1111
1237    ; y=-1,x=+3 + luma
1238    punpcklwd        m3, m2
1239    pshufd           m3, m3, q0000
1240
1241%if ARCH_X86_64
1242    SWAP              1, 8
1243    SWAP              0, 9
1244    SWAP              3, 10
1245    SWAP              5, 13
1246    DEFINE_ARGS buf, bufy, fg_data, h, x
1247%else
1248%define m8  [rsp+ 6*16]
1249%define m9  [rsp+ 7*16]
1250%define m10 [rsp+ 8*16]
1251%define m13 [rsp+11*16]
1252    mova             m8, m1
1253    mova             m9, m0
1254    mova            m10, m3
1255    mova            m13, m5
1256    DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
1257%endif
    ; NOTE(review): the pointer adjustments below depend on where the code
    ; before this section left bufq/bufyq -- confirm against the macro prologue
1258%if %2
1259    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
1260%else
1261    sub            bufq, 2*(82*69+3)
1262%endif
1263    add           bufyq, 2*(79+82*3)
1264    mov              hd, 70-35*%3               ; number of rows to generate
1265.y_loop_ar3:
1266    mov              xq, -(76>>%2)              ; negative column index, counts up to 0
1267
    ; Each pass computes the contribution of the three rows above, luma and the
    ; rounding constant for up to 4 consecutive pixels (one per dword of m0);
    ; the inner loop then finishes those pixels one at a time.
1268.x_loop_ar3:
1269    ; first line
1270    movu             m0, [bufq+xq*2-82*6-6+ 0]      ; y=-3,x=[-3,+4]
1271    movd             m1, [bufq+xq*2-82*6-6+16]      ; y=-3,x=[+5,+6]
1272    palignr          m2, m1, m0, 2                  ; y=-3,x=[-2,+5]
1273    palignr          m1, m1, m0, 12                 ; y=-3,x=[+3,+6]
1274    punpckhwd        m3, m0, m2                     ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
1275    punpcklwd        m0, m2                         ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
1276    shufps           m2, m0, m3, q1032              ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
1277
1278    pmaddwd          m0, [tmp+0*16]
1279    pmaddwd          m2, [tmp+1*16]
1280    pmaddwd          m3, [tmp+2*16]
1281    paddd            m0, m2
1282    paddd            m0, m3                         ; first 6 taps (x=-3..+2) of row y=-3
1283
1284    ; second line [m0/1 are busy]
1285    movu             m2, [bufq+xq*2-82*4-6+ 0]      ; y=-2,x=[-3,+4]
1286    movd             m3, [bufq+xq*2-82*4-6+16]      ; y=-2,x=[+5,+6]
1287    punpcklwd        m1, m2                         ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
1288    palignr          m4, m3, m2, 2                  ; y=-2,x=[-2,+5]
1289    palignr          m3, m3, m2, 4                  ; y=-2,x=[-1,+6]
1290    punpckhwd        m5, m4, m3                     ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
1291    punpcklwd        m4, m3                         ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
1292    shufps           m3, m4, m5, q1032              ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
1293    pmaddwd          m1, [tmp+3*16]
1294    pmaddwd          m4, [tmp+4*16]
1295    pmaddwd          m3, [tmp+5*16]
1296    pmaddwd          m5, m6
1297    paddd            m1, m4
1298    paddd            m3, m5
1299    paddd            m0, m1
1300    paddd            m0, m3                         ; top 2 lines
1301
1302    ; third line [m0 is busy] & luma + round
1303    movu             m1, [bufq+xq*2-82*2-6+ 0]      ; y=-1,x=[-3,+4]
1304    movd             m2, [bufq+xq*2-82*2-6+16]      ; y=-1,x=[+5,+6]
1305%if %2
    ; sum each horizontal luma pair (of both rows when %3 is set)
1306    movu             m5, [bufyq+xq*4]
1307%if %3
1308    movu             m4, [bufyq+xq*4+82*2]
1309    phaddw           m5, m4
1310%else
1311    phaddw           m5, m5
1312%endif
1313%else
1314    movq             m5, [bufyq+xq*2]
1315%endif
1316    palignr          m3, m2, m1, 2                  ; y=-1,x=[-2,+5]
1317    palignr          m2, m2, m1, 12                 ; y=-1,x=[+3,+6]
1318%if %3
    ; add the pair sums of the second luma row to those of the first
1319    pshufd           m4, m5, q3232
1320    paddw            m5, m4
1321%endif
1322%if %2
    ; scale the summed luma samples with rounding
1323%if cpuflag(sse4)
1324    pmulhrsw         m5, m11
1325%elif %3
1326    pmulhrsw         m5, [base+pw_8192]
1327%else
1328    pmulhrsw         m5, [base+pw_16384]
1329%endif
1330%endif
1331    punpckhwd        m4, m1, m3                     ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
1332    punpcklwd        m1, m3                         ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
1333    shufps           m3, m1, m4, q1032              ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
1334    punpcklwd        m2, m5                         ; (y=-1,x=+3.. , luma) pairs
1335    pmaddwd          m1, m7
1336    pmaddwd          m3, m8
1337    pmaddwd          m4, m9
1338    pmaddwd          m2, m10
1339    paddd            m1, m3
1340    paddd            m4, m2
1341    paddd            m0, m12                        ; += round
1342    paddd            m1, m4
1343    paddd            m0, m1
1344
1345    movu             m1, [bufq+xq*2-6]      ; y=0,x=[-3,+4]
1346.x_loop_ar3_inner:
1347    pmaddwd          m2, m1, m13            ; dword 0 = x=-3/-2 taps, dword 1 = x=-1 tap + cur*"1.0"
1348    pshufd           m3, m2, q1111
1349    paddd            m2, m3                 ; left+cur
1350    paddd            m2, m0                 ; add top
1351    psrldq           m0, 4                  ; shift top to next pixel
1352    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
1353    packssdw         m2, m2
1354    pminsw           m2, m14
1355    pmaxsw           m2, m15
1356    pslldq           m2, 4
1357    psrldq           m1, 2
    ; insert the new pixel into word 2 of the shifted current-row register
1358%if cpuflag(sse4)
1359    pblendw          m1, m2, 00000100b
1360%else
1361    pand             m1, m11
1362    pandn            m3, m11, m2
1363    por              m1, m3
1364%endif
1365    ; overwrite previous pixels, should be ok
1366    movq  [bufq+xq*2-4], m1
1367    inc              xq
1368    jz .x_loop_ar3_end
1369    test             xq, 3
1370    jnz .x_loop_ar3_inner
1371    jmp .x_loop_ar3
1372
1373.x_loop_ar3_end:
1374    add            bufq, 82*2
1375    add           bufyq, 82*2<<%3
1376    dec              hd
1377    jg .y_loop_ar3
1378%if WIN64
    ; undo the manual stack alignment done at the top of .ar3
1379    mov             rsp, r6
1380%elif ARCH_X86_32
1381%undef m8
1382%undef m9
1383%undef m10
1384%undef m11
1385%undef m12
1386%undef m13
1387%undef m14
1388%undef m15
1389%endif
1390    RET
1391%endmacro
1392
; Instantiate the chroma grain generators, one per chroma layout.
; In the .ar2/.ar3 paths of the macro, the second argument (%2) selects
; horizontally paired luma reads and a row width of 76>>%2, and the third
; argument (%3) selects the extra luma row and a row count of 70-35*%3.
; NOTE(review): the first argument is presumably the layout tag used in the
; emitted symbol name -- the macro header is outside this excerpt, confirm.
1393generate_grain_uv_fn 420, 1, 1
1394generate_grain_uv_fn 422, 1, 0
1395generate_grain_uv_fn 444, 0, 0
1396
; SCRATCH src, dst, slot
; Makes the value currently held in m<src> reachable under the name m<dst>.
;   %1 = source register number
;   %2 = register number the value should be known as afterwards
;   %3 = stack slot index, only used on x86-32
; x86-32: stores m<src> to [rsp+slot*mmsize] and defines m<dst> as that memory
;         operand, so later uses of m<dst> read the stack copy.
; otherwise: hands m<src> and m<dst> to SWAP; no store is issued here.
1397%macro SCRATCH 3
1398%if ARCH_X86_32
1399    mova [rsp+%3*mmsize], m%1
1400%define m%2 [rsp+%3*mmsize]
1401%else
1402    SWAP             %1, %2
1403%endif
1404%endmacro
1405
1406INIT_XMM ssse3
1407%if ARCH_X86_32
1408%if STACK_ALIGNMENT < mmsize
1409cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \
1410        dst, src, scaling, unused1, fg_data, picptr, unused2
1411    ; copy stack arguments to new position post-alignment, so that we
1412    ; don't have to keep the old stack location in a separate register
1413    mov              r0, r0m
1414    mov              r1, r2m
1415    mov              r2, r4m
1416    mov              r3, r6m
1417    mov              r4, r7m
1418    mov              r5, r8m
1419
1420%define r0m [rsp+8*mmsize+ 3*gprsize]
1421%define r2m [rsp+8*mmsize+ 5*gprsize]
1422%define r4m [rsp+8*mmsize+ 7*gprsize]
1423%define r6m [rsp+8*mmsize+ 9*gprsize]
1424%define r7m [rsp+8*mmsize+10*gprsize]
1425%define r8m [rsp+8*mmsize+11*gprsize]
1426
1427    mov             r0m, r0
1428    mov             r2m, r1
1429    mov             r4m, r2
1430    mov             r6m, r3
1431    mov             r7m, r4
1432    mov             r8m, r5
1433%else
1434cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \
1435        dst, src, scaling, unused1, fg_data, picptr, unused2
1436%endif
1437    mov            srcq, srcm
1438    mov        scalingq, r5m
1439    mov        fg_dataq, r3m
1440%if STACK_ALIGNMENT < mmsize
1441    mov              r6, r9m
1442
1443%define r9m [rsp+8*mmsize+ 4*gprsize]
1444%define r3m [rsp+8*mmsize+ 6*gprsize]
1445%define r5m [rsp+8*mmsize+ 8*gprsize]
1446
1447    mov             r9m, r6
1448%endif
1449    LEA              r5, $$
1450%define base r5-$$
1451    mov             r5m, picptrq
1452%else
1453cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
1454    lea              r8, [pb_mask]
1455%define base r8-pb_mask
1456%endif
1457    mov             r6d, [fg_dataq+FGData.scaling_shift]
1458    SPLATW           m3, [base+mul_bits+r6*2-14]
1459    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
1460%if ARCH_X86_32
1461    DECLARE_REG_TMP   0, 3
1462%else
1463    DECLARE_REG_TMP   9, 10
1464%endif
1465    mov             t0d, r9m        ; bdmax
1466    sar             t0d, 11         ; is_12bpc
1467    inc             t0d
1468    mov             t1d, r6d
1469    imul            t1d, t0d
1470    dec             t0d
1471    SPLATW           m5, [base+min+t1*2]
1472    lea             t0d, [t0d*3]
1473    lea             t0d, [r6d*2+t0d]
1474    SPLATW           m4, [base+max+t0*2]
1475    SPLATW           m2, r9m
1476
1477    pcmpeqw          m1, m1
1478    psraw            m7, m2, 1              ; max_grain
1479    pxor             m1, m7                 ; min_grain
1480    SPLATD           m6, [base+pd_16]
1481
1482    SCRATCH           1,  9, 0
1483    SCRATCH           2, 10, 1
1484    SCRATCH           3, 11, 2
1485    SCRATCH           4, 12, 3
1486    SCRATCH           5, 13, 4
1487    SCRATCH           6, 14, 5
1488    SCRATCH           7, 15, 6
1489
1490    mova             m6, [base+pw_27_17_17_27]   ; for horizontal filter
1491
1492%if ARCH_X86_32
1493    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2
1494    DECLARE_REG_TMP   0
1495%else
1496    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
1497                sby, see
1498    DECLARE_REG_TMP   7
1499%endif
1500
1501    mov            sbyd, r8m
1502    movzx           t0d, byte [fg_dataq+FGData.overlap_flag]
1503    test            t0d, t0d
1504    jz .no_vertical_overlap
1505    test           sbyd, sbyd
1506    jnz .vertical_overlap
1507.no_vertical_overlap:
1508    mov       dword r8m, t0d
1509
1510%if ARCH_X86_32
1511    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
1512    imul           seed, (173 << 24) | 37
1513%else
1514    imul           seed, sbyd, (173 << 24) | 37
1515%endif
1516    add            seed, (105 << 24) | 178
1517    rol            seed, 8
1518    movzx          seed, seew
1519    xor            seed, [fg_dataq+FGData.seed]
1520
1521%if ARCH_X86_32
1522    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1523
1524    mov             r3m, seed
1525    mov              wq, r4m
1526%else
1527    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1528                unused1, unused2, see, src_bak
1529%endif
1530
1531    lea        src_bakq, [srcq+wq*2]
1532    mov            r9mp, src_bakq
1533    neg              wq
1534    sub           dstmp, srcq
1535%if ARCH_X86_32
1536    mov             r4m, wq
1537%endif
1538
1539.loop_x:
1540%if ARCH_X86_32
1541    mov            seed, r3m
1542%endif
1543    mov             r6d, seed
1544    or             seed, 0xEFF4
1545    shr             r6d, 1
1546    test           seeb, seeh
1547    lea            seed, [r6+0x8000]
1548    cmovp          seed, r6d                ; updated seed
1549
1550%if ARCH_X86_32
1551    mov             r3m, seed
1552
1553    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1554
1555    mov           offxd, offyd
1556%else
1557    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1558                offx, offy, see, src_bak
1559
1560    mov           offyd, seed
1561    mov           offxd, seed
1562%endif
1563    ror           offyd, 8
1564    shr           offxd, 12
1565    and           offyd, 0xf
1566    imul          offyd, 164
1567    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1568
1569%if ARCH_X86_32
1570    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1571%else
1572    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1573                h, offxy, see, src_bak
1574%endif
1575
1576.loop_x_odd:
1577    movzx            hd, word r7m
1578    mov      grain_lutq, grain_lutmp
1579.loop_y:
1580    ; src
1581    pand             m0, m10, [srcq+ 0]
1582    pand             m1, m10, [srcq+16]          ; m0-1: src as word
1583
1584    ; scaling[src]
1585%if ARCH_X86_32
1586    vpgatherdw       m2, m0, scalingq-1, r0, r5, 8, 1, m4
1587    vpgatherdw       m3, m1, scalingq-1, r0, r5, 8, 1, m4
1588%else
1589    vpgatherdw       m2, m0, scalingq-1, r11, r13, 8, 1, m4
1590    vpgatherdw       m3, m1, scalingq-1, r11, r13, 8, 1, m4
1591%endif
1592    REPX   {psrlw x, 8}, m2, m3
1593
1594    ; grain = grain_lut[offy+y][offx+x]
1595    movu             m4, [grain_lutq+offxyq*2]
1596    movu             m5, [grain_lutq+offxyq*2+16]
1597
1598    ; noise = round2(scaling[src] * grain, scaling_shift)
1599    REPX {pmullw x, m11}, m2, m3
1600    pmulhrsw         m4, m2
1601    pmulhrsw         m5, m3
1602
1603    ; dst = clip_pixel(src, noise)
1604    paddw            m0, m4
1605    paddw            m1, m5
1606    pmaxsw           m0, m13
1607    pmaxsw           m1, m13
1608    pminsw           m0, m12
1609    pminsw           m1, m12
1610    movifnidn      dstq, dstmp
1611    mova [dstq+srcq+ 0], m0
1612    mova [dstq+srcq+16], m1
1613
1614    add            srcq, r2mp               ; src += stride
1615    add      grain_lutq, 82*2
1616    dec              hd
1617    jg .loop_y
1618
1619%if ARCH_X86_32
1620    add            r4mp, 16
1621%else
1622    add              wq, 16
1623%endif
1624    jge .end
1625%if ARCH_X86_32
1626    mov            srcq, r9mp
1627    add            srcq, r4mp
1628    add            srcq, r4mp
1629%else
1630    mov        src_bakq, r9mp
1631    lea            srcq, [src_bakq+wq*2]
1632%endif
1633    btc       dword r8m, 2
1634    jc .next_blk
1635    add          offxyd, 16
1636    test      dword r8m, 2
1637    jz .loop_x_odd
1638%if ARCH_X86_32
1639    add dword [rsp+8*mmsize+1*gprsize], 16
1640%else
1641    add            r12d, 16                 ; top_offxy += 16
1642%endif
1643    jmp .loop_x_odd_v_overlap
1644
1645.next_blk:
1646    test      dword r8m, 1
1647    jz .loop_x
1648
1649    ; r8m = sbym
1650    test      dword r8m, 2
1651    jnz .loop_x_hv_overlap
1652
1653    ; horizontal overlap (without vertical overlap)
1654.loop_x_h_overlap:
1655%if ARCH_X86_32
1656    add          offxyd, 16
1657    mov [rsp+8*mmsize+0*gprsize], offxyd
1658    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1659    mov            seed, r3m
1660%endif
1661
1662    mov             r6d, seed
1663    or             seed, 0xEFF4
1664    shr             r6d, 1
1665    test           seeb, seeh
1666    lea            seed, [r6+0x8000]
1667    cmovp          seed, r6d                ; updated seed
1668
1669%if ARCH_X86_32
1670    mov             r3m, seed
1671
1672    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
1673
1674    mov           offxd, offyd
1675%else
1676    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1677                offx, offy, see, src_bak, left_offxy
1678
1679    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
1680
1681    mov           offyd, seed
1682    mov           offxd, seed
1683%endif
1684    ror           offyd, 8
1685    shr           offxd, 12
1686    and           offyd, 0xf
1687    imul          offyd, 164
1688    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1689
1690%if ARCH_X86_32
1691    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1692%else
1693    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1694                h, offxy, see, src_bak, left_offxy
1695%endif
1696
1697    mov              hd, dword r7m
1698    mov      grain_lutq, grain_lutmp
1699.loop_y_h_overlap:
1700    ; grain = grain_lut[offy+y][offx+x]
1701    movu             m5, [grain_lutq+offxyq*2]
1702%if ARCH_X86_32
1703    mov              r5, [rsp+8*mmsize+0*gprsize]
1704    movd             m4, [grain_lutq+r5*2]
1705%else
1706    movd             m4, [grain_lutq+left_offxyq*2]
1707%endif
1708    punpcklwd        m4, m5
1709    pmaddwd          m4, m6
1710    paddd            m4, m14
1711    psrad            m4, 5
1712    packssdw         m4, m4
1713    pminsw           m4, m15
1714    pmaxsw           m4, m9
1715    shufps           m4, m5, q3210
1716
1717    ; src
1718    pand             m0, m10, [srcq+ 0]
1719    pand             m1, m10, [srcq+16]          ; m0-1: src as word
1720
1721    ; scaling[src]
1722%if ARCH_X86_32
1723    vpgatherdw       m2, m0, scalingq-1, r0, r5, 8, 1, m5
1724    vpgatherdw       m3, m1, scalingq-1, r0, r5, 8, 1, m5
1725%else
1726    vpgatherdw       m2, m0, scalingq-1, r13, r14, 8, 1, m5
1727    vpgatherdw       m3, m1, scalingq-1, r13, r14, 8, 1, m5
1728%endif
1729    REPX   {psrlw x, 8}, m2, m3
1730
1731    ; noise = round2(scaling[src] * grain, scaling_shift)
1732    movu             m5, [grain_lutq+offxyq*2+16]
1733    REPX {pmullw x, m11}, m2, m3
1734    pmulhrsw         m4, m2
1735    pmulhrsw         m5, m3
1736
1737    ; dst = clip_pixel(src, noise)
1738    paddw            m0, m4
1739    paddw            m1, m5
1740    pmaxsw           m0, m13
1741    pmaxsw           m1, m13
1742    pminsw           m0, m12
1743    pminsw           m1, m12
1744    movifnidn      dstq, dstmp
1745    mova [dstq+srcq+ 0], m0
1746    mova [dstq+srcq+16], m1
1747
1748    add            srcq, r2mp
1749    add      grain_lutq, 82*2
1750    dec              hd
1751    jg .loop_y_h_overlap
1752
1753%if ARCH_X86_32
1754    add            r4mp, 16
1755%else
1756    add              wq, 16
1757%endif
1758    jge .end
1759%if ARCH_X86_32
1760    mov            srcq, r9mp
1761    add            srcq, r4mp
1762    add            srcq, r4mp
1763%else
1764    mov        src_bakq, r9mp
1765    lea            srcq, [src_bakq+wq*2]
1766%endif
1767    or        dword r8m, 4
1768    add          offxyd, 16
1769
1770    ; r8m = sbym
1771    test      dword r8m, 2
1772    jz .loop_x_odd
1773%if ARCH_X86_32
1774    add dword [rsp+8*mmsize+1*gprsize], 16
1775%else
1776    add            r12d, 16                 ; top_offxy += 16
1777%endif
1778    jmp .loop_x_odd_v_overlap
1779
1780.end:
1781    RET
1782
1783.vertical_overlap:
1784    or              t0d, 2
1785    mov             r8m, t0d
1786
1787%if ARCH_X86_32
1788    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused
1789%else
1790    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
1791                sby, see
1792%endif
1793
1794    movzx          sbyd, sbyb
1795%if ARCH_X86_32
1796    imul             r4, [fg_dataq+FGData.seed], 0x00010001
1797    DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused
1798%else
1799    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1800%endif
1801    imul            t0d, sbyd, 173 * 0x00010001
1802    imul           sbyd, 37 * 0x01000100
1803    add             t0d, (105 << 16) | 188
1804    add            sbyd, (178 << 24) | (141 << 8)
1805    and             t0d, 0x00ff00ff
1806    and            sbyd, 0xff00ff00
1807    xor            seed, t0d
1808%if ARCH_X86_32
1809    xor            sbyd, seed
1810
1811    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1812
1813    mov             r3m, seed
1814    mov              wq, r4m
1815%else
1816    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1817
1818    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1819                unused1, unused2, see, src_bak
1820%endif
1821
1822    lea        src_bakq, [srcq+wq*2]
1823    mov            r9mp, src_bakq
1824    neg              wq
1825    sub           dstmp, srcq
1826%if ARCH_X86_32
1827    mov             r4m, wq
1828%endif
1829
1830.loop_x_v_overlap:
1831%if ARCH_X86_32
1832    mov              r5, r5m
1833    SPLATD           m7, [base+pw_27_17_17_27]
1834    mov            seed, r3m
1835%else
1836    SPLATD           m7, [pw_27_17_17_27]
1837%endif
1838
1839    ; we assume from the block above that bits 8-15 of r7d are zero'ed
1840    mov             r6d, seed
1841    or             seed, 0xeff4eff4
1842    test           seeb, seeh
1843    setp            t0b                     ; parity of top_seed
1844    shr            seed, 16
1845    shl             t0d, 16
1846    test           seeb, seeh
1847    setp            t0b                     ; parity of cur_seed
1848    or              r6d, 0x00010001
1849    xor             t0d, r6d
1850    mov            seed, t0d
1851    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
1852
1853%if ARCH_X86_32
1854    mov             r3m, seed
1855
1856    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1857
1858    mov           offxd, offyd
1859%else
1860    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1861                offx, offy, see, src_bak, unused, top_offxy
1862
1863    mov           offyd, seed
1864    mov           offxd, seed
1865%endif
1866    ror           offyd, 8
1867    ror           offxd, 12
1868    and           offyd, 0xf000f
1869    and           offxd, 0xf000f
1870    imul          offyd, 164
1871    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1872    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
1873
1874%if ARCH_X86_32
1875    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
1876%else
1877    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1878                h, offxy, see, src_bak, unused, top_offxy
1879%endif
1880
1881    movzx    top_offxyd, offxyw
1882%if ARCH_X86_32
1883    mov [rsp+8*mmsize+1*gprsize], top_offxyd
1884
1885    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1886%endif
1887    shr          offxyd, 16
1888
1889.loop_x_odd_v_overlap:
1890%if ARCH_X86_32
1891    mov              r5, r5m
1892%endif
1893    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)]
1894    mov              hd, dword r7m
1895    mov      grain_lutq, grain_lutmp
1896.loop_y_v_overlap:
1897    ; grain = grain_lut[offy+y][offx+x]
1898    movu             m3, [grain_lutq+offxyq*2]
1899%if ARCH_X86_32
1900    mov              r5, [rsp+8*mmsize+1*gprsize]
1901    movu             m2, [grain_lutq+r5*2]
1902%else
1903    movu             m2, [grain_lutq+top_offxyq*2]
1904%endif
1905    punpckhwd        m4, m2, m3
1906    punpcklwd        m2, m3
1907    REPX {pmaddwd x, m7}, m4, m2
1908    REPX {paddd   x, m14}, m4, m2
1909    REPX {psrad   x, 5}, m4, m2
1910    packssdw         m2, m4
1911    pminsw           m2, m15
1912    pmaxsw           m2, m9
1913    movu             m4, [grain_lutq+offxyq*2+16]
1914%if ARCH_X86_32
1915    movu             m3, [grain_lutq+r5*2+16]
1916%else
1917    movu             m3, [grain_lutq+top_offxyq*2+16]
1918%endif
1919    punpckhwd        m5, m3, m4
1920    punpcklwd        m3, m4
1921    REPX {pmaddwd x, m7}, m5, m3
1922    REPX {paddd   x, m14}, m5, m3
1923    REPX {psrad   x, 5}, m5, m3
1924    packssdw         m3, m5
1925    pminsw           m3, m15
1926    pmaxsw           m3, m9
1927
1928    ; src
1929    pand             m0, m10, [srcq+ 0]          ; m0-1: src as word
1930    pand             m1, m10, [srcq+16]          ; m0-1: src as word
1931
1932    ; scaling[src]
1933    ; noise = round2(scaling[src] * grain, scaling_shift)
1934%if ARCH_X86_32
1935    vpgatherdw       m4, m0, scalingq-1, r0, r5, 8, 1, m5
1936%else
1937    vpgatherdw       m4, m0, scalingq-1, r11, r13, 8, 1, m5
1938%endif
1939    psrlw            m4, 8
1940    pmullw           m4, m11
1941    pmulhrsw         m4, m2
1942%if ARCH_X86_32
1943    vpgatherdw       m5, m1, scalingq-1, r0, r5, 8, 1, m2
1944%else
1945    vpgatherdw       m5, m1, scalingq-1, r11, r13, 8, 1, m2
1946%endif
1947    psrlw            m5, 8
1948    pmullw           m5, m11
1949    pmulhrsw         m5, m3
1950
1951    ; dst = clip_pixel(src, noise)
1952    paddw            m0, m4
1953    paddw            m1, m5
1954    pmaxsw           m0, m13
1955    pmaxsw           m1, m13
1956    pminsw           m0, m12
1957    pminsw           m1, m12
1958    movifnidn      dstq, dstmp
1959    mova [dstq+srcq+ 0], m0
1960    mova [dstq+srcq+16], m1
1961
1962    add            srcq, r2mp
1963    add      grain_lutq, 82*2
1964    dec              hw
1965    jz .end_y_v_overlap
1966    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1967    ; remaining (up to) 30 lines
1968%if ARCH_X86_32
1969    mov              r5, r5m
1970%endif
1971    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)+4]
1972    xor              hd, 0x10000
1973    test             hd, 0x10000
1974    jnz .loop_y_v_overlap
1975    jmp .loop_y
1976
1977.end_y_v_overlap:
    ; One 16-pixel column is done on the vertical-overlap path. Step the column
    ; counter by 16 (it lives in the r4m slot on x86-32 and in wq on x86-64);
    ; jge leaves through .end_hv as soon as the counter is >= 0.
1978%if ARCH_X86_32
1979    add            r4mp, 16
1980%else
1981    add              wq, 16
1982%endif
1983    jge .end_hv
    ; srcq = value saved in r9m + 2 * counter (two adds on x86-32, one lea on
    ; x86-64; the factor 2 matches the 16-bit samples this file works on).
1984%if ARCH_X86_32
1985    mov            srcq, r9mp
1986    add            srcq, r4mp
1987    add            srcq, r4mp
1988%else
1989    mov        src_bakq, r9mp
1990    lea            srcq, [src_bakq+wq*2]
1991%endif
    ; btc flips bit 2 of r8m and returns its previous value in CF. If the bit
    ; was set, continue at .next_blk_v; otherwise keep the current grain offsets
    ; and just move them 16 entries to the right for the next column.
1992    btc       dword r8m, 2
1993    jc .next_blk_v
1994%if ARCH_X86_32
1995    add dword [rsp+8*mmsize+1*gprsize], 16 ; stack copy of top_offxy += 16
1996%else
1997    add      top_offxyd, 16
1998%endif
1999    add          offxyd, 16
2000    jmp .loop_x_odd_v_overlap
2001
2002.next_blk_v:
2003    ; since fg_dataq.overlap is guaranteed to be set, we never jump
2004    ; back to .loop_x_v_overlap, and instead always fall-through to
2005    ; h+v overlap
2006
2007.loop_x_hv_overlap:
    ; Per-column setup for the h+v overlap path: keep the previous column's
    ; offsets (+16) as left/topleft, advance both packed 16-bit seeds, derive
    ; the new cur/top grain offsets, then load the first vertical weight pair
    ; and the row count.
2008%if ARCH_X86_32
2009    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
2010
    ; r3 still contains the previous column's offxy (it was named offxy before
    ; the DEFINE_ARGS above); r0 is loaded with the stored top_offxy.
2011    mov              r0, [rsp+8*mmsize+1*gprsize]
2012    add              r3, 16
2013    add              r0, 16
2014    mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
2015    mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy
2016
2017    mov            seed, r3m
    ; NOTE(review): presumably t0 aliases r0 on x86-32, so this clears t0 for
    ; the setp/shl sequence below - confirm against DECLARE_REG_TMP.
2018    xor              r0, r0
2019%else
2020    ; we assume from the block above that bits 8-15 of r7d are zero'ed
2021%endif
    ; r6d keeps the old packed seeds; t0d collects one parity bit per half
    ; (bit 16 for the low/top half, bit 0 for the high/cur half).
2022    mov             r6d, seed
2023    or             seed, 0xeff4eff4
2024    test           seeb, seeh
2025    setp            t0b                     ; parity of top_seed
2026    shr            seed, 16
2027    shl             t0d, 16
2028    test           seeb, seeh
2029    setp            t0b                     ; parity of cur_seed
    ; Force bit 0 of each half to 1, flip it with the collected parity bit and
    ; rotate right by one: each 16-bit half becomes (old >> 1) with the
    ; inverted parity flag as its new bit 15.
2030    or              r6d, 0x00010001
2031    xor             t0d, r6d
2032    mov            seed, t0d
2033    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
2034
2035%if ARCH_X86_32
2036    mov             r3m, seed
2037
2038    DEFINE_ARGS  dst, src, scaling, offy, w, picptr, offx
2039
    ; offy is the register that held see, so only offx needs a copy
2040    mov           offxd, offyd
2041%else
2042    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2043                offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy
2044
    ; offy is the register that was named offxy in the loops: previous offsets
    ; + 16 become the left/topleft offsets of the new column.
2045    lea  topleft_offxyq, [top_offxyq+16]
2046    lea     left_offxyq, [offyq+16]
2047    mov           offyd, seed
2048    mov           offxd, seed
2049%endif
    ; offy = bits 8-11 and offx = bits 12-15 of each 16-bit seed, still packed
    ; as two halves of one 32-bit register.
2050    ror           offyd, 8
2051    ror           offxd, 12
2052    and           offyd, 0xf000f
2053    and           offxd, 0xf000f
2054    imul          offyd, 164
2055    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    ; 0x10001*747 adds 747 to both halves; 32*82 only reaches the low (top)
    ; half, i.e. 32 rows further down (the loops step grain_lutq by 82*2 bytes
    ; per row).
2056    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
2057
2058%if ARCH_X86_32
2059    DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut
2060%else
2061    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2062                h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
2063%endif
2064
    ; unpack: low half -> top_offxy, high half -> offxy
2065    movzx    top_offxyd, offxyw
2066%if ARCH_X86_32
2067    mov [rsp+8*mmsize+1*gprsize], top_offxyd
2068
2069    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2070%endif
2071    shr          offxyd, 16
2072
2073%if ARCH_X86_32
2074    mov              r5, r5m
2075%endif
    ; m7 = first word pair of pw_27_17_17_27 (27, 17), used by the vertical
    ; blend of the first row.
2076    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)]
2077
    ; only the low 16 bits are loaded as row count; bit 16 of hd starts at 0
    ; and serves as the overlap-row toggle in the loop below.
2078    movzx            hd, word r7m
2079    mov      grain_lutq, grain_lutmp
2080.loop_y_hv_overlap:
    ; One row of 16 samples with both horizontal and vertical grain blending.
    ; Executed for two rows (see the bit-16 toggle at the bottom), after which
    ; control moves to .loop_y_h_overlap.
2081    ; grain = grain_lut[offy+y][offx+x]
    ; m2 = cur, m4 = top (8 words each); m5 = left, m3 = topleft (movd: the
    ; first 2 words only)
2082    movu             m2, [grain_lutq+offxyq*2]
2083%if ARCH_X86_32
2084    mov              r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
2085    mov              r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
2086    movu             m4, [grain_lutq+r0*2]
2087    movd             m5, [grain_lutq+r5*2]
2088    mov              r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
2089    movd             m3, [grain_lutq+r5*2]
2090%else
2091    movu             m4, [grain_lutq+top_offxyq*2]
2092    movd             m5, [grain_lutq+left_offxyq*2]
2093    movd             m3, [grain_lutq+topleft_offxyq*2]
2094%endif
2095    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    ; m6 (weights) and m14 (term added before the >>5) are loaded before this
    ; loop, outside this view - presumably the pw_27_17_17_27 row and pd_16.
    ; pminsw m15 / pmaxsw m9 clamp the result to an upper / lower bound.
2096    punpcklwd        m5, m2
2097    punpcklwd        m3, m4
2098    REPX {pmaddwd x, m6}, m5, m3
2099    REPX {paddd   x, m14}, m5, m3
2100    REPX {psrad   x, 5}, m5, m3
    ; after the pack: low 4 words = blended cur 0-3, high 4 words = blended top 0-3
2101    packssdw         m5, m3
2102    pminsw           m5, m15
2103    pmaxsw           m5, m9
    ; m3 = {blended cur 0-3, original cur 4-7}
2104    shufps           m3, m5, m2, q3210
    ; m5 = {blended top 0-3, original top 4-7}
2105    shufps           m5, m4, q3232
2106    ; followed by v interpolation (top | cur -> cur)
    ; m0/m1 = second 8 words of cur/top
2107    movu             m0, [grain_lutq+offxyq*2+16]
2108%if ARCH_X86_32
2109    movu             m1, [grain_lutq+r0*2+16]
2110%else
2111    movu             m1, [grain_lutq+top_offxyq*2+16]
2112%endif
    ; interleave {top, cur} words so pmaddwd with the weight pair in m7 gives
    ; top*w0 + cur*w1 per sample
2113    punpcklwd        m2, m5, m3
2114    punpckhwd        m5, m3
2115    punpcklwd        m3, m1, m0
2116    punpckhwd        m1, m0
2117    REPX {pmaddwd x, m7}, m2, m5, m3, m1
2118    REPX {paddd   x, m14}, m2, m5, m3, m1
2119    REPX {psrad   x, 5}, m2, m5, m3, m1
    ; m2 / m3 = final grain for samples 0-7 / 8-15
2120    packssdw         m2, m5
2121    packssdw         m3, m1
2122    REPX {pminsw x, m15}, m2, m3
2123    REPX {pmaxsw x, m9}, m2, m3
2124
2125    ; src
    ; the source words are ANDed with m10 (mask set up outside this view)
2126    pand             m0, m10, [srcq+ 0]
2127    pand             m1, m10, [srcq+16]          ; m0-1: src as word
2128
2129    ; scaling[src]
2130    ; noise = round2(scaling[src] * grain, scaling_shift)
    ; NOTE(review): vpgatherdw is defined in the included filmgrain_common.asm;
    ; the GPR operands and the trailing xmm operand look like scratch registers
    ; for the macro - confirm against its definition.
2131%if ARCH_X86_32
2132    vpgatherdw       m4, m0, scalingq-1, r0, r5, 8, 1, m5
2133%else
2134    vpgatherdw       m4, m0, scalingq-1, r14, r10, 8, 1, m5
2135%endif
2136    psrlw            m4, 8
2137    pmullw           m4, m11
2138    pmulhrsw         m2, m4
2139%if ARCH_X86_32
2140    vpgatherdw       m5, m1, scalingq-1, r0, r5, 8, 1, m4
2141%else
2142    vpgatherdw       m5, m1, scalingq-1, r14, r10, 8, 1, m4
2143%endif
2144    psrlw            m5, 8
2145    pmullw           m5, m11
2146    pmulhrsw         m3, m5
2147
2148    ; dst = clip_pixel(src, noise)
    ; clamp: pmaxsw with m13 (lower bound), pminsw with m12 (upper bound)
2149    paddw            m0, m2
2150    paddw            m1, m3
2151    pmaxsw           m0, m13
2152    pmaxsw           m1, m13
2153    pminsw           m0, m12
2154    pminsw           m1, m12
2155    movifnidn      dstq, dstmp
2156    mova [dstq+srcq+ 0], m0
2157    mova [dstq+srcq+16], m1
2158
2159    add            srcq, r2mp
2160    add      grain_lutq, 82*2
    ; dec on the 16-bit view keeps the toggle in bit 16 of hd untouched
2161    dec              hw
2162    jz .end_y_hv_overlap
2163    ; 2 lines get vertical overlap, then fall back to non-overlap code for
2164    ; remaining (up to) 30 lines
2165%if ARCH_X86_32
2166    mov              r5, r5m
2167%endif
    ; m7 = second word pair of pw_27_17_17_27 (17, 27) for the next row
2168    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)+4]
    ; bit 16 of hd flips every row: 0 -> 1 after the first row (run this loop
    ; once more), 1 -> 0 after the second (continue in .loop_y_h_overlap)
2169    xor              hd, 0x10000
2170    test             hd, 0x10000
2171    jnz .loop_y_hv_overlap
2172    jmp .loop_y_h_overlap
2173
2174.end_y_hv_overlap:
    ; Reached from .loop_y_hv_overlap when the row counter hits zero. Bit 2 of
    ; r8m is set here, so the next `btc dword r8m, 2` in .end_y_v_overlap finds
    ; it set and branches to .next_blk_v.
2175    or        dword r8m, 4
    ; advance the column counter by 16; done once it is >= 0
2176%if ARCH_X86_32
2177    add            r4mp, 16
2178%else
2179    add              wq, 16
2180%endif
2181    jge .end_hv
    ; Next column keeps the same grain offsets, shifted 16 entries to the
    ; right; srcq = value saved in r9m + 2 * counter.
2182%if ARCH_X86_32
2183    mov              r5, r5m
2184    add          offxyd, 16
2185    add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16
2186    mov            srcq, r9mp
2187    add            srcq, r4mp
2188    add            srcq, r4mp
2189%else
2190    add          offxyd, 16
2191    add      top_offxyd, 16
2192    mov        src_bakq, r9mp
2193    lea            srcq, [src_bakq+wq*2]
2194%endif
2195    jmp .loop_x_odd_v_overlap
2196
2197.end_hv:
    ; Common exit, reached from .end_y_v_overlap and .end_y_hv_overlap once the
    ; column counter is >= 0.
2198    RET
2199%if ARCH_X86_32
    ; x86inc's DECLARE_ARG (re)defines rNm/rNmp for the listed argument numbers.
    ; NOTE(review): presumably this restores the default definitions after
    ; stack-slot overrides made in this function's prologue (not visible in this
    ; part of the file) - confirm.
2200    DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
2201%endif
2202
2203%macro FGUV_FN 3 ; name, ss_hor, ss_ver
2204INIT_XMM ssse3
2205%if ARCH_X86_32
2206%if STACK_ALIGNMENT < mmsize
2207cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \
2208        tmp, src, scaling, h, fg_data, picptr, unused
2209    mov              r0, r0m
2210    mov              r1, r1m
2211    mov              r2, r2m
2212    mov              r4, r3m
2213    mov              r3, r4m
2214    mov              r5, r5m
2215%define r0m [rsp+8*mmsize+ 3*gprsize]
2216%define r1m [rsp+8*mmsize+ 4*gprsize]
2217%define r2m [rsp+8*mmsize+ 5*gprsize]
2218%define r3m [rsp+8*mmsize+ 6*gprsize]
2219%define r4m [rsp+8*mmsize+ 7*gprsize]
2220%define r5m [rsp+8*mmsize+ 8*gprsize]
2221    mov             r0m, r0
2222    mov             r2m, r2
2223    mov             r4m, r3
2224    mov             r5m, r5
2225
2226    mov              r0, r6m
2227    mov              r2, r7m
2228    mov              r3, r8m
2229    mov              r5, r9m
2230%define r6m [rsp+8*mmsize+ 9*gprsize]
2231%define r7m [rsp+8*mmsize+10*gprsize]
2232%define r8m [rsp+8*mmsize+11*gprsize]
2233%define r9m [rsp+8*mmsize+12*gprsize]
2234    mov             r6m, r0
2235    mov             r7m, r2
2236    mov             r8m, r3
2237    mov             r9m, r5
2238
2239    mov              r2, r10m
2240    mov              r3, r11m
2241    mov              r5, r12m
2242    mov              r0, r13m
2243%define r10m [rsp+8*mmsize+13*gprsize]
2244%define r11m [rsp+8*mmsize+14*gprsize]
2245%define r12m [rsp+8*mmsize+15*gprsize]
2246    mov            r10m, r2
2247    mov            r11m, r3
2248    mov            r12m, r5
2249
2250    SPLATW           m2, r13m
2251%else
2252cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
2253        tmp, src, scaling, h, fg_data, picptr, unused
2254    mov            srcq, srcm
2255    mov        fg_dataq, r3m
2256%endif
2257    LEA              r5, $$
2258%define base r5-$$
2259
2260    DECLARE_REG_TMP   0, 2, 3
2261%else
2262cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
2263                                      grain_lut, h, sby, luma, lstride, uv_pl, is_id
2264%define base r8-pb_mask
2265    lea              r8, [pb_mask]
2266
2267    DECLARE_REG_TMP   9, 10, 11
2268%endif
2269    mov             r6d, [fg_dataq+FGData.scaling_shift]
2270    SPLATW           m3, [base+mul_bits+r6*2-14]
2271    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
2272%if STACK_ALIGNMENT >= mmsize
2273    mov             t0d, r13m               ; bdmax
2274%endif
2275    sar             t0d, 11                 ; is_12bpc
2276    inc             t0d
2277    mov             t1d, r6d
2278    imul            t1d, t0d
2279    dec             t0d
2280    SPLATW           m5, [base+min+t1*2]
2281    lea             t1d, [t0d*3]
2282    mov             t2d, r12m
2283    inc             t2d
2284    imul            r6d, t2d
2285    add             t1d, r6d
2286    SPLATW           m4, [base+max+t1*2]
2287%if STACK_ALIGNMENT >= mmsize
2288    SPLATW           m2, r13m
2289%endif
2290
2291    SCRATCH           2, 10, 2
2292    SCRATCH           3, 11, 3
2293    SCRATCH           4, 12, 4
2294    SCRATCH           5, 13, 5
2295
2296%define mzero m7
2297
2298%if %3
2299    SPLATD           m2, [base+pw_23_22]
2300%endif
2301
2302%if ARCH_X86_32
2303    mov        scalingq, r5m
2304    mov             r5m, r5
2305%else
2306    mov           r13mp, strideq
2307%endif
2308
2309    pcmpeqw          m0, m0
2310    psraw            m1, m10, 1
2311    pxor             m0, m1
2312
2313    SCRATCH           0,  8, 0
2314    SCRATCH           1,  9, 1
2315
2316    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
2317    jne .csfl
2318
2319%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v
2320%if ARCH_X86_32
2321    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2322
2323    DECLARE_REG_TMP    0
2324%else
2325    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
2326
2327    DECLARE_REG_TMP    9
2328%endif
2329
2330%if %1
2331    mov             r6d, r11m
2332    SPLATW           m0, [fg_dataq+FGData.uv_mult+r6*4]
2333    SPLATW           m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
2334    punpcklwd        m6, m1, m0
2335    SPLATW           m5, [fg_dataq+FGData.uv_offset+r6*4]
2336    SPLATD           m7, [base+pw_4+t0*4]
2337    pmullw           m5, m7
2338%else
2339    SPLATD           m6, [base+pd_16]
2340%if %2
2341    mova             m5, [base+pw_23_22]
2342%else
2343    mova             m5, [base+pw_27_17_17_27]
2344%endif
2345%endif
2346
2347    SCRATCH           6, 14, 6
2348    SCRATCH           5, 15, 7
2349
2350%if ARCH_X86_32
2351    DECLARE_REG_TMP   0
2352%else
2353    DECLARE_REG_TMP   7
2354%endif
2355
2356    mov            sbyd, r8m
2357    mov             t0d, [fg_dataq+FGData.overlap_flag]
2358    test            t0d, t0d
2359    jz %%no_vertical_overlap
2360    test           sbyd, sbyd
2361    jnz %%vertical_overlap
2362
2363%%no_vertical_overlap:
2364    mov             r8m, t0d
2365%if ARCH_X86_32
2366    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
2367    imul           seed, (173 << 24) | 37
2368%else
2369    imul           seed, sbyd, (173 << 24) | 37
2370%endif
2371    add            seed, (105 << 24) | 178
2372    rol            seed, 8
2373    movzx          seed, seew
2374    xor            seed, [fg_dataq+FGData.seed]
2375%if ARCH_X86_32
2376    mov             r3m, seed
2377
2378    DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
2379
2380    mov            dstq, r0mp
2381    mov           lumaq, r9mp
2382    mov              wq, r4m
2383    lea              r3, [srcq+wq*2]
2384    mov            r1mp, r3
2385    lea              r3, [dstq+wq*2]
2386    mov           r11mp, r3
2387    lea              r3, [lumaq+wq*(2<<%2)]
2388    mov           r12mp, r3
2389%if %3
2390    shl           r10mp, 1
2391%endif
2392%else
2393    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2394                unused2, unused3, see, unused4, unused5, unused6, luma, lstride
2395
2396    mov        lstrideq, r10mp
2397%if %3
2398    add        lstrideq, lstrideq
2399%endif
2400    mov           lumaq, r9mp
2401    lea             r10, [srcq+wq*2]
2402    lea             r11, [dstq+wq*2]
2403    lea             r12, [lumaq+wq*(2<<%2)]
2404    mov           r10mp, r10
2405    mov           r11mp, r11
2406    mov           r12mp, r12
2407%endif
2408    neg              wq
2409%if ARCH_X86_32
2410    mov           r4mp, wq
2411%endif
2412
2413%%loop_x:
    ; Start of a 16-pixel column without any overlap: advance the seed and turn
    ; it into an offset into the grain LUT.
2414%if ARCH_X86_32
2415    mov            seed, r3m
2416%endif
2417
    ; r6 = seed >> 1. The parity flag of (low byte & high byte) of seed|0xEFF4
    ; selects the new seed: r6 when PF is set, r6 + 0x8000 when it is clear
    ; (lea does not touch the flags, so cmovp still sees the result of test).
2418    mov             r6d, seed
2419    or             seed, 0xEFF4
2420    shr             r6d, 1
2421    test           seeb, seeh
2422    lea            seed, [r6+0x8000]
2423    cmovp          seed, r6d               ; updated seed
2424
2425%if ARCH_X86_32
2426    mov             r3m, seed
2427
2428    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
2429
    ; offy is the register that held see, so only offx needs a copy
2430    mov           offxd, offyd
2431%else
2432    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2433                offx, offy, see, unused1, unused2, unused3, luma, lstride
2434
2435    mov           offxd, seed
2436    mov           offyd, seed
2437%endif
    ; offy = (seed >> 8) & 0xf, offx = seed >> 12. The multipliers depend on
    ; the subsampling parameters: offy counts 164 entries (two 82-entry rows)
    ; without ss_v and 82 with it; offx counts 2 entries without ss_h and 1
    ; with it.
2438    ror           offyd, 8
2439    shr           offxd, 12
2440    and           offyd, 0xf
2441    imul          offyd, 164>>%3
2442    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
2443
2444%if ARCH_X86_32
2445    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2446%else
2447    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2448                h, offxy, see, unused1, unused2, unused3, luma, lstride
2449%endif
2450
    ; loop_x_odd only exists without horizontal subsampling: the code after the
    ; row loop jumps here with offxy += 16 and without generating a new seed.
2451%if %2 == 0
2452%%loop_x_odd:
2453%endif
2454    mov              hd, r7m
2455    mov      grain_lutq, grain_lutmp
2456%%loop_y:
    ; One row of 16 chroma samples without grain blending: fetch chroma and
    ; luma, build the scaling index, look up scaling, apply grain and clamp.
2457    ; src
2458    mova             m0, [srcq]
2459    mova             m1, [srcq+16]          ; m0-1: src as word
2460
2461    ; luma_src
2462    pxor          mzero, mzero
2463%if ARCH_X86_32
    ; r5 switches from picptr to luma here; the luma pointer itself lives in
    ; the r9m slot between rows
2464    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
2465
2466    mov           lumaq, r9m
2467%endif
    ; With horizontal subsampling 32 luma words are read per row: phaddw sums
    ; adjacent pairs and pavgw against zero halves the sum with rounding.
2468    mova             m4, [lumaq+ 0]
2469    mova             m6, [lumaq+(16<<%2)]
2470%if %2
2471    phaddw           m4, [lumaq+16]
2472    phaddw           m6, [lumaq+48]
2473%endif
2474%if ARCH_X86_32
    ; step the luma pointer by the value in r10m and store it back
2475    add           lumaq, r10mp
2476    mov             r9m, lumaq
2477%endif
2478%if %2
2479    pavgw            m4, mzero
2480    pavgw            m6, mzero
2481%endif
2482
2483%if %1
    ; not-csfl: index = clamp(((luma*uv_luma_mult + chroma*uv_mult) >> 6) + m15,
    ; 0, m10). m14 holds the interleaved {uv_luma_mult, uv_mult} words and m15
    ; the scaled uv_offset (both prepared at the top of this macro); m10 is the
    ; bdmax broadcast from the prologue.
2484    punpckhwd        m3, m4, m0
2485    punpcklwd        m4, m0
2486    punpckhwd        m5, m6, m1
2487    punpcklwd        m6, m1                 ; { luma, chroma }
2488    REPX {pmaddwd x, m14}, m3, m4, m5, m6
2489    REPX {psrad   x, 6}, m3, m4, m5, m6
2490    packssdw         m4, m3
2491    packssdw         m6, m5
2492    REPX {paddw x, m15}, m4, m6
2493    REPX {pmaxsw x, mzero}, m4, m6
2494    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
2495%else
    ; csfl: the (downsampled) luma is used directly, ANDed with m10
2496    REPX  {pand x, m10}, m4, m6
2497%endif
2498
2499    ; scaling[luma_src]
    ; NOTE(review): vpgatherdw is defined in the included filmgrain_common.asm;
    ; the two GPR operands look like scratch registers - confirm.
2500%if ARCH_X86_32
2501    vpgatherdw       m3, m4, scalingq-1, r0, r5, 8, 1
2502    vpgatherdw       m5, m6, scalingq-1, r0, r5, 8, 1
2503%else
2504    vpgatherdw       m3, m4, scalingq-1, r10, r12, 8, 1
2505    vpgatherdw       m5, m6, scalingq-1, r10, r12, 8, 1
2506%endif
2507    REPX   {psrlw x, 8}, m3, m5
2508
2509    ; grain = grain_lut[offy+y][offx+x]
2510    movu             m4, [grain_lutq+offxyq*2]
2511    movu             m6, [grain_lutq+offxyq*2+16]
2512
2513    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    ; m11 is the multiplier picked from mul_bits by scaling_shift in the prologue
2514    REPX {pmullw x, m11}, m3, m5
2515    pmulhrsw         m4, m3
2516    pmulhrsw         m6, m5
2517
2518    ; dst = clip_pixel(src, noise)
    ; m13 / m12 are the lower / upper bounds loaded from the min / max tables
    ; in the prologue
2519    paddw            m0, m4
2520    paddw            m1, m6
2521    pmaxsw           m0, m13
2522    pmaxsw           m1, m13
2523    pminsw           m0, m12
2524    pminsw           m1, m12
2525    movifnidn      dstq, dstmp
2526    mova      [dstq+ 0], m0
2527    mova      [dstq+16], m1
2528
    ; next row: x86-32 steps by the value in r2m and writes dst back to its
    ; slot; x86-64 steps by the stride copy in r13m (stored in the prologue)
2529%if ARCH_X86_32
2530    add            srcq, r2mp
2531    add            dstq, r2mp
2532    mov           dstmp, dstq
2533%else
2534    add            srcq, r13mp
2535    add            dstq, r13mp
2536    add           lumaq, lstrideq
2537%endif
2538    add      grain_lutq, 82*2
2539    dec              hd
2540    jg %%loop_y
2541
2542%if ARCH_X86_32
2543    DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma
2544
2545    mov              wq, r4mp
2546%endif
2547    add              wq, 16
2548    jge %%end
2549%if ARCH_X86_32
2550    mov            srcq, r1mp
2551%else
2552    mov            srcq, r10mp
2553%endif
2554    mov            dstq, r11mp
2555    mov           lumaq, r12mp
2556    lea            srcq, [srcq+wq*2]
2557    lea            dstq, [dstq+wq*2]
2558    lea           lumaq, [lumaq+wq*(2<<%2)]
2559%if ARCH_X86_32
2560    mov             r0m, dstq
2561    mov             r9m, lumaq
2562    mov             r4m, wq
2563%endif
2564%if %2 == 0
2565    btc       dword r8m, 2
2566    jc %%next_blk
2567    add          offxyd, 16
2568    test      dword r8m, 2
2569    jz %%loop_x_odd
2570%if ARCH_X86_32
2571    add dword [rsp+8*mmsize+1*gprsize], 16
2572%else
2573    add            r11d, 16
2574%endif
2575    jmp %%loop_x_odd_v_overlap
2576%%next_blk:
2577%endif
2578    test      dword r8m, 1
2579    je %%loop_x
2580
2581    ; r8m: sby argument slot, overwritten earlier with the overlap flags; bit 1 is set on the vertical_overlap path
2582    test      dword r8m, 2
2583    jnz %%loop_x_hv_overlap
2584
2585    ; horizontal overlap (without vertical overlap)
2586%%loop_x_h_overlap:
    ; Column setup when only horizontal blending applies: keep the previous
    ; column's offset (+16) as left_offxy, then advance the seed and compute the
    ; new offset exactly as loop_x does.
2587%if ARCH_X86_32
    ; previous offxy + 16 is kept in the first scratch slot above the xmm area
2588    add          offxyd, 16
2589    mov [rsp+8*mmsize+0*gprsize], offxyd
2590
2591    DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
2592
2593    mov            seed, r3m
2594%endif
    ; r6 = seed >> 1; the new seed is r6 when the parity flag of
    ; (low byte & high byte) of seed|0xEFF4 is set, r6 + 0x8000 otherwise
2595    mov             r6d, seed
2596    or             seed, 0xEFF4
2597    shr             r6d, 1
2598    test           seeb, seeh
2599    lea            seed, [r6+0x8000]
2600    cmovp          seed, r6d               ; updated seed
2601
2602%if ARCH_X86_32
2603    mov             r3m, seed
2604
2605    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
2606
    ; offy is the register that held see, so only offx needs a copy
2607    mov           offxd, offyd
2608%else
2609    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2610                offx, offy, see, left_offxy, unused1, unused2, luma, lstride
2611
    ; offy is the register that was named offxy in the row loop
2612    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
2613    mov           offxd, seed
2614    mov           offyd, seed
2615%endif
    ; offy = (seed >> 8) & 0xf, offx = seed >> 12; multipliers depend on the
    ; ss_v / ss_h macro parameters as in loop_x
2616    ror           offyd, 8
2617    shr           offxd, 12
2618    and           offyd, 0xf
2619    imul          offyd, 164>>%3
2620    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
2621
2622%if ARCH_X86_32
2623    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2624%else
2625    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2626                h, offxy, see, left_offxy, unused1, unused2, luma, lstride
2627%endif
2628
2629    mov              hd, r7m
2630    mov      grain_lutq, grain_lutmp
2631%%loop_y_h_overlap:
    ; One row of 16 chroma samples where the leftmost grain values are blended
    ; with the previous column's grain (horizontal overlap only).
2632    mova             m0, [srcq]
2633    mova             m1, [srcq+16]
2634
2635    ; luma_src
2636    pxor          mzero, mzero
2637%if ARCH_X86_32
    ; r5 switches from picptr to luma; the luma pointer lives in the r9m slot
2638    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
2639    mov           lumaq, r9m
2640%endif
    ; with horizontal subsampling: pairwise sum (phaddw) then rounded halving
    ; (pavgw against zero) of 32 luma words
2641    mova             m4, [lumaq+ 0]
2642    mova             m6, [lumaq+(16<<%2)]
2643%if %2
2644    phaddw           m4, [lumaq+16]
2645    phaddw           m6, [lumaq+48]
2646%endif
2647%if ARCH_X86_32
2648    add           lumaq, r10mp
2649    mov             r9m, lumaq
2650%endif
2651%if %2
2652    pavgw            m4, mzero
2653    pavgw            m6, mzero
2654%endif
2655
2656%if %1
    ; not-csfl: index = clamp(((luma*uv_luma_mult + chroma*uv_mult) >> 6) + m15,
    ; 0, m10), with m14 = interleaved multipliers and m15 = scaled uv_offset
2657    punpckhwd        m3, m4, m0
2658    punpcklwd        m4, m0
2659    punpckhwd        m5, m6, m1
2660    punpcklwd        m6, m1                 ; { luma, chroma }
2661    REPX {pmaddwd x, m14}, m3, m4, m5, m6
2662    REPX {psrad   x, 6}, m3, m4, m5, m6
2663    packssdw         m4, m3
2664    packssdw         m6, m5
2665    REPX {paddw x, m15}, m4, m6
2666    REPX {pmaxsw x, mzero}, m4, m6
2667    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
2668%else
2669    REPX  {pand x, m10}, m4, m6
2670%endif
2671
2672    ; grain = grain_lut[offy+y][offx+x]
    ; m7 (mzero above) is reused for the current grain from here on
2673    movu             m7, [grain_lutq+offxyq*2]
2674%if ARCH_X86_32
2675    mov              r5, [rsp+8*mmsize+0*gprsize]
2676    movd             m5, [grain_lutq+r5*2]
2677%else
2678    movd             m5, [grain_lutq+left_offxyq*2+ 0]
2679%endif
2680    punpcklwd        m5, m7                ; {left0, cur0}
    ; Blend weights: in the not-csfl variant m14/m15 are occupied by the uv
    ; multipliers and offset, so the weights and the +16 term come from memory;
    ; otherwise m15 holds the weight row and m14 the pd_16 splat (both loaded at
    ; the top of this macro). The weight rows end in (0, 32) pairs, so only the
    ; leading 1 (pw_23_22) or 2 (pw_27_17_17_27) samples actually change.
2681%if %1
2682%if ARCH_X86_32
2683    mov              r5, r5m
2684%endif
2685%if %2
2686    pmaddwd          m5, [PIC_ptr(pw_23_22)]
2687%else
2688    pmaddwd          m5, [PIC_ptr(pw_27_17_17_27)]
2689%endif
2690    paddd            m5, [PIC_ptr(pd_16)]
2691%else
2692    pmaddwd          m5, m15
2693    paddd            m5, m14
2694%endif
2695    psrad            m5, 5
2696    packssdw         m5, m5
    ; clamp to the grain range: m8 = ~(bdmax >> 1), m9 = bdmax >> 1 (prologue)
2697    pmaxsw           m5, m8
2698    pminsw           m5, m9
    ; m5 = {blended grain 0-3, original grain 4-7}; m3 = grain 8-15
2699    shufps           m5, m7, q3210
2700    movu             m3, [grain_lutq+offxyq*2+16]
2701
2702    ; scaling[luma_src]
    ; NOTE(review): the GPR operands of vpgatherdw look like scratch registers.
    ; On x86-64 r2 (stride) is one of them here, which fits the row step below
    ; reading the stride from r13m instead - confirm against the macro.
2703%if ARCH_X86_32
2704    vpgatherdw       m7, m4, scalingq-1, r0, r5, 8, 1
2705    vpgatherdw       m4, m6, scalingq-1, r0, r5, 8, 1
2706%else
2707    vpgatherdw       m7, m4, scalingq-1, r2, r12, 8, 1
2708    vpgatherdw       m4, m6, scalingq-1, r2, r12, 8, 1
2709%endif
2710    REPX   {psrlw x, 8}, m7, m4
2711
2712    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2713    REPX {pmullw x, m11}, m7, m4
2714    pmulhrsw         m5, m7
2715    pmulhrsw         m3, m4
2716
2717    ; dst = clip_pixel(src, noise)
2718    paddw            m0, m5
2719    paddw            m1, m3
2720    pmaxsw           m0, m13
2721    pmaxsw           m1, m13
2722    pminsw           m0, m12
2723    pminsw           m1, m12
2724    movifnidn      dstq, dstmp
2725    mova      [dstq+ 0], m0
2726    mova      [dstq+16], m1
2727
2728%if ARCH_X86_32
2729    add            srcq, r2mp
2730    add            dstq, r2mp
2731    mov           dstmp, dstq
2732%else
2733    add            srcq, r13mp
2734    add            dstq, r13mp
2735    add           lumaq, lstrideq
2736%endif
2737    add      grain_lutq, 82*2
2738    dec              hd
2739    jg %%loop_y_h_overlap
2740
2741%if ARCH_X86_32
2742    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
2743    mov              wq, r4mp
2744%endif
2745    add              wq, 16
2746    jge %%end
2747%if ARCH_X86_32
2748    mov            srcq, r1mp
2749%else
2750    mov            srcq, r10mp
2751%endif
2752    mov            dstq, r11mp
2753    mov           lumaq, r12mp
2754    lea            srcq, [srcq+wq*2]
2755    lea            dstq, [dstq+wq*2]
2756    lea           lumaq, [lumaq+wq*(2<<%2)]
2757%if ARCH_X86_32
2758    mov            r0mp, dstq
2759    mov            r9mp, lumaq
2760    mov             r4m, wq
2761%endif
2762
2763%if %2
2764    ; r8m: sby argument slot, overwritten earlier with the overlap flags; bit 1 is set on the vertical_overlap path
2765    test      dword r8m, 2
2766    jne %%loop_x_hv_overlap
2767    jmp %%loop_x_h_overlap
2768%else
2769    or        dword r8m, 4
2770    add          offxyd, 16
2771
2772    ; r8m: sby argument slot, overwritten earlier with the overlap flags; bit 1 is set on the vertical_overlap path
2773    test      dword r8m, 2
2774    jz %%loop_x_odd
2775%if ARCH_X86_32
2776    add dword [rsp+8*mmsize+1*gprsize], 16
2777%else
2778    add            r11d, 16                 ; top_offxy += 16
2779%endif
2780    jmp %%loop_x_odd_v_overlap
2781%endif
2782
2783%%end:
2784    RET
2785
2786%%vertical_overlap:
    ; Entered when FGData.overlap_flag is non-zero and sby is non-zero. Marks
    ; the vertical-overlap state as bit 1 of the flags kept in the r8m slot,
    ; builds the packed (cur_seed << 16) | top_seed pair and prepares the
    ; column pointers.
2787    or              t0d, 2
2788    mov             r8m, t0d
2789
2790%if ARCH_X86_32
2791    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2792%else
2793    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
2794                sby, see, unused1, unused2, unused3, lstride
2795%endif
2796
2797    movzx          sbyd, sbyb
2798%if ARCH_X86_32
    ; r4 is fg_data here and is renamed to see by the DEFINE_ARGS below
2799    imul             r4, [fg_dataq+FGData.seed], 0x00010001
2800
2801    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
2802%else
2803    imul           seed, [fg_dataq+FGData.seed], 0x00010001
2804%endif
    ; Both 16-bit halves are derived in parallel: t0d supplies the low byte of
    ; each half (sby*173 + 105 for cur, + 188 for top), sbyd the high byte
    ; (sby*37 + 178 for cur, + 141 for top). 188 = 105 - 173 and 141 = 178 - 37
    ; modulo 256, i.e. the top half is the cur formula evaluated for sby - 1.
2805    imul            t0d, sbyd, 173 * 0x00010001
2806    imul           sbyd, 37 * 0x01000100
2807    add             t0d, (105 << 16) | 188
2808    add            sbyd, (178 << 24) | (141 << 8)
2809    and             t0d, 0x00ff00ff
2810    and            sbyd, 0xff00ff00
2811    xor            seed, t0d
2812%if ARCH_X86_32
    ; the result lands in the sby register, which the DEFINE_ARGS below
    ; renames to see
2813    xor            sbyd, seed
2814
2815    DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
2816
    ; r1m / r11m / r12m = src / dst / luma advanced by w samples (luma by twice
    ; as many with horizontal subsampling)
2817    mov             r3m, seed
2818    mov            dstq, r0mp
2819    mov           lumaq, r9mp
2820    mov              wq, r4m
2821    lea              r3, [srcq+wq*2]
2822    mov            r1mp, r3
2823    lea              r3, [dstq+wq*2]
2824    mov           r11mp, r3
2825    lea              r3, [lumaq+wq*(2<<%2)]
2826    mov           r12mp, r3
    ; with vertical subsampling the value in r10m (added to the luma pointer
    ; once per row) is doubled
2827%if %3
2828    shl           r10mp, 1
2829%endif
2830%else
2831    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
2832
2833    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2834                unused1, unused2, see, unused3, unused4, unused5, luma, lstride
2835
    ; lstride = value from r10m, doubled with vertical subsampling
2836    mov        lstrideq, r10mp
2837%if %3
2838    add        lstrideq, lstrideq
2839%endif
    ; r10m / r11m / r12m = src / dst / luma advanced by w samples (luma by twice
    ; as many with horizontal subsampling)
2840    mov           lumaq, r9mp
2841    lea             r10, [srcq+wq*2]
2842    lea             r11, [dstq+wq*2]
2843    lea             r12, [lumaq+wq*(2<<%2)]
2844    mov           r10mp, r10
2845    mov           r11mp, r11
2846    mov           r12mp, r12
2847%endif
    ; w becomes a negative counter that the column loops step by +16 towards 0
2848    neg              wq
2849%if ARCH_X86_32
2850    mov             r4m, wq
2851%endif
2852
2853%%loop_x_v_overlap:
    ; Column setup for vertical-only overlap: advance both packed seeds, derive
    ; the cur and top grain offsets, then load the row count (and, without
    ; vertical subsampling, the first weight pair).
2854%if ARCH_X86_32
2855    mov            seed, r3m
2856    xor             t0d, t0d
2857%else
2858    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    ; (t0 is r7 on x86-64 here; the `and t0d, 0x00ff00ff` in vertical_overlap
    ; cleared those bits)
2859%endif
    ; r6d keeps the old packed seeds; t0d collects one parity bit per half
    ; (bit 16 for the low/top half, bit 0 for the high/cur half)
2860    mov             r6d, seed
2861    or             seed, 0xeff4eff4
2862    test           seeb, seeh
2863    setp            t0b                     ; parity of top_seed
2864    shr            seed, 16
2865    shl             t0d, 16
2866    test           seeb, seeh
2867    setp            t0b                     ; parity of cur_seed
    ; bit 0 of each half is forced to 1 and flipped by the parity bit; the
    ; rotate then turns each half into (old >> 1) with that bit as new bit 15
2868    or              r6d, 0x00010001
2869    xor             t0d, r6d
2870    mov            seed, t0d
2871    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
2872%if ARCH_X86_32
2873    mov             r3m, seed
2874
2875    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
2876
    ; offy is the register that held see, so only offx needs a copy
2877    mov           offxd, offyd
2878%else
2879    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2880                offx, offy, see, unused1, top_offxy, unused2, luma, lstride
2881
2882    mov           offyd, seed
2883    mov           offxd, seed
2884%endif
    ; offy = bits 8-11 and offx = bits 12-15 of each 16-bit seed, kept packed
2885    ror           offyd, 8
2886    ror           offxd, 12
2887    and           offyd, 0xf000f
2888    and           offxd, 0xf000f
2889    imul          offyd, 164>>%3
2890    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    ; the 0x10001* term is added to both halves; the trailing (32>>ss_v)*82
    ; term only reaches the low (top) half, moving it that many rows down
2891    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
2892
2893%if ARCH_X86_32
2894    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
2895%else
2896    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2897                h, offxy, see, unused1, top_offxy, unused2, luma, lstride
2898%endif
    ; unpack: low half -> top_offxy (kept in a stack slot on x86-32),
    ; high half -> offxy
2899    movzx    top_offxyd, offxyw
2900%if ARCH_X86_32
2901    mov [rsp+8*mmsize+1*gprsize], top_offxyd
2902    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2903%endif
2904    shr          offxyd, 16
2905
    ; loop_x_odd_v_overlap only exists without horizontal subsampling; the
    ; jumps to it first add 16 to offxy and to top_offxy
2906%if %2 == 0
2907%%loop_x_odd_v_overlap:
2908%endif
    ; Without vertical subsampling the first row uses the (27, 17) pair. With
    ; it, m2 is not reloaded here; the prologue loaded pw_23_22 into m2.
2909%if %3 == 0
2910%if ARCH_X86_32
2911    mov              r5, r5m
2912%endif
2913    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)]
2914%endif
2915
2916    mov              hd, r7m
2917    mov      grain_lutq, grain_lutmp
2918%%loop_y_v_overlap:
2919    ; grain = grain_lut[offy+y][offx+x]
2920    movu             m3, [grain_lutq+offxyq*2]
2921%if ARCH_X86_32
2922    mov              r0, [rsp+mmsize*8+gprsize*1] ; top_offxy
2923    movu             m5, [grain_lutq+r0*2]
2924%else
2925    movu             m5, [grain_lutq+top_offxyq*2]
2926%endif
2927    punpckhwd        m7, m5, m3
2928    punpcklwd        m5, m3                 ; {top/cur interleaved}
2929    REPX {pmaddwd x, m2}, m7, m5
2930%if %1
2931%if ARCH_X86_32
2932    mov              r5, r5m
2933%endif
2934    REPX  {paddd x, [PIC_ptr(pd_16)]}, m7, m5
2935%else
2936    REPX  {paddd x, m14}, m7, m5
2937%endif
2938    REPX   {psrad x, 5}, m7, m5
2939    packssdw         m3, m5, m7
2940    pmaxsw           m3, m8
2941    pminsw           m3, m9
2942
2943    ; grain = grain_lut[offy+y][offx+x]
2944    movu             m4, [grain_lutq+offxyq*2+16]
2945%if ARCH_X86_32
2946    movu             m5, [grain_lutq+r0*2+16]
2947%else
2948    movu             m5, [grain_lutq+top_offxyq*2+16]
2949%endif
2950    punpckhwd        m7, m5, m4
2951    punpcklwd        m5, m4                 ; {top/cur interleaved}
2952    REPX {pmaddwd x, m2}, m7, m5
2953%if %1
2954    REPX  {paddd x, [PIC_ptr(pd_16)]}, m7, m5
2955%else
2956    REPX  {paddd x, m14}, m7, m5
2957%endif
2958    REPX   {psrad x, 5}, m7, m5
2959    packssdw         m4, m5, m7
2960    pmaxsw           m4, m8
2961    pminsw           m4, m9
2962
2963    ; src
2964    mova             m0, [srcq]
2965    mova             m1, [srcq+16]
2966
2967    ; luma_src
2968    pxor          mzero, mzero
2969%if ARCH_X86_32
2970    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
2971
2972    mov           lumaq, r9mp
2973%endif
2974    mova             m5, [lumaq+ 0]
2975    mova             m6, [lumaq+(16<<%2)]
2976%if %2
2977    phaddw           m5, [lumaq+16]
2978    phaddw           m6, [lumaq+48]
2979%endif
2980%if ARCH_X86_32
2981    add           lumaq, r10mp
2982    mov            r9mp, lumaq
2983%endif
2984%if %2
2985    pavgw            m5, mzero
2986    pavgw            m6, mzero
2987%endif
2988
2989%if %1
2990    punpckhwd        m7, m5, m0
2991    punpcklwd        m5, m0
2992    REPX {pmaddwd x, m14}, m7, m5
2993    REPX {psrad   x, 6}, m7, m5
2994    packssdw         m5, m7
2995    punpckhwd        m7, m6, m1
2996    punpcklwd        m6, m1                 ; { luma, chroma }
2997    REPX {pmaddwd x, m14}, m7, m6
2998    REPX {psrad   x, 6}, m7, m6
2999    packssdw         m6, m7
3000    pxor          mzero, mzero
3001    REPX {paddw x, m15}, m5, m6
3002    REPX {pmaxsw x, mzero}, m5, m6
3003    REPX {pminsw x, m10}, m5, m6            ; clip_pixel()
3004%else
3005    REPX  {pand x, m10}, m5, m6
3006%endif
3007
3008    ; scaling[luma_src]
3009%if ARCH_X86_32
3010    vpgatherdw       m7, m5, scalingq-1, r0, r5, 8, 1
3011    vpgatherdw       m5, m6, scalingq-1, r0, r5, 8, 1
3012%else
3013    vpgatherdw       m7, m5, scalingq-1, r10, r12, 8, 1
3014    vpgatherdw       m5, m6, scalingq-1, r10, r12, 8, 1
3015%endif
3016    REPX   {psrlw x, 8}, m7, m5
3017
3018    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
3019    REPX {pmullw x, m11}, m7, m5
3020    pmulhrsw         m3, m7
3021    pmulhrsw         m4, m5
3022
3023    ; dst = clip_pixel(src, noise)
3024    paddw            m0, m3
3025    paddw            m1, m4
3026    pmaxsw           m0, m13
3027    pmaxsw           m1, m13
3028    pminsw           m0, m12
3029    pminsw           m1, m12
3030    movifnidn      dstq, dstmp
3031    mova      [dstq+ 0], m0
3032    mova      [dstq+16], m1
3033
3034    dec              hw
3035    jle %%end_y_v_overlap
3036%if ARCH_X86_32
3037    add            srcq, r2mp
3038    add            dstq, r2mp
3039    mov           dstmp, dstq
3040%else
3041    add            srcq, r13mp
3042    add            dstq, r13mp
3043    add           lumaq, lstrideq
3044%endif
3045    add      grain_lutq, 82*2
3046%if %3
3047    jmp %%loop_y
3048%else
3049    btc              hd, 16
3050    jc %%loop_y
3051%if ARCH_X86_32
3052    mov              r5, r5m
3053%endif
3054    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)+4]
3055    jmp %%loop_y_v_overlap
3056%endif
3057
3058%%end_y_v_overlap:
3059%if ARCH_X86_32
3060    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
3061
3062    mov              wq, r4m
3063%endif
3064    add              wq, 16
3065    jge %%end_hv
3066%if ARCH_X86_32
3067    mov            srcq, r1mp
3068%else
3069    mov            srcq, r10mp
3070%endif
3071    mov            dstq, r11mp
3072    mov           lumaq, r12mp
3073    lea            srcq, [srcq+wq*2]
3074    lea            dstq, [dstq+wq*2]
3075    lea           lumaq, [lumaq+wq*(2<<%2)]
3076%if ARCH_X86_32
3077    mov            r0mp, dstq
3078    mov            r9mp, lumaq
3079    mov             r4m, wq
3080%endif
3081
3082%if %2
3083    ; since fg_dataq.overlap is guaranteed to be set, we never jump
3084    ; back to .loop_x_v_overlap, and instead always fall-through to
3085    ; h+v overlap
3086%else
3087    btc       dword r8m, 2
3088    jc %%loop_x_hv_overlap
3089    add          offxyd, 16
3090%if ARCH_X86_32
3091    add dword [rsp+8*mmsize+1*gprsize], 16
3092%else
3093    add            r11d, 16
3094%endif
3095    jmp %%loop_x_odd_v_overlap
3096%endif
3097
3098%%loop_x_hv_overlap:
3099%if ARCH_X86_32
3100    DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut
3101
3102    mov             t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy
3103    add          offxyd, 16
3104    add             t0d, 16
3105    mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd
3106    mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd
3107
3108    DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
3109
3110    mov            seed, r3m
3111    xor             t0d, t0d
3112%else
3113    ; we assume from the block above that bits 8-15 of r7d are zero'ed
3114%endif
3115    mov             r6d, seed
3116    or             seed, 0xeff4eff4
3117    test           seeb, seeh
3118    setp            t0b                     ; parity of top_seed
3119    shr            seed, 16
3120    shl             t0d, 16
3121    test           seeb, seeh
3122    setp            t0b                     ; parity of cur_seed
3123    or              r6d, 0x00010001
3124    xor             t0d, r6d
3125    mov            seed, t0d
3126    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
3127%if ARCH_X86_32
3128    mov             r3m, seed
3129
3130    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
3131
3132    mov           offxd, offyd
3133%else
3134    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
3135                offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
3136
3137    lea  topleft_offxyq, [top_offxyq+16]
3138    lea     left_offxyq, [offyq+16]
3139    mov           offyd, seed
3140    mov           offxd, seed
3141%endif
3142    ror           offyd, 8
3143    ror           offxd, 12
3144    and           offyd, 0xf000f
3145    and           offxd, 0xf000f
3146    imul          offyd, 164>>%3
3147    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
3148    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
3149
3150%if ARCH_X86_32
3151    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy
3152%else
3153    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
3154                h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
3155%endif
3156    movzx    top_offxyd, offxyw
3157%if ARCH_X86_32
3158    mov [rsp+8*mmsize+1*gprsize], top_offxyd
3159
3160    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
3161%endif
3162    shr          offxyd, 16
3163
3164%if %3 == 0
3165%if ARCH_X86_32
3166    mov              r5, r5m
3167%endif
3168    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)]
3169%endif
3170
3171    mov              hd, r7m
3172    mov      grain_lutq, grain_lutmp
3173%%loop_y_hv_overlap:
3174    ; grain = grain_lut[offy+y][offx+x]
3175%if ARCH_X86_32
3176    mov              r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
3177    mov              r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
3178    movd             m5, [grain_lutq+r5*2]
3179%else
3180    movd             m5, [grain_lutq+left_offxyq*2]
3181%endif
3182    movu             m7, [grain_lutq+offxyq*2]
3183%if ARCH_X86_32
3184    mov              r5, [rsp+8*mmsize+2*gprsize]
3185    movu             m4, [grain_lutq+r0*2]
3186%if %2
3187    pinsrw           m5, [grain_lutq+r5*2], 2
3188%else
3189    movd             m3, [grain_lutq+r5*2]
3190%endif
3191%else
3192    movu             m4, [grain_lutq+top_offxyq*2]
3193%if %2
3194    pinsrw           m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left }
3195%else
3196    movd             m3, [grain_lutq+topleft_offxyq*2]
3197%endif
3198%endif
3199%if %2 == 0
3200    punpckldq        m5, m3
3201%endif
3202    punpckldq        m3, m7, m4             ; { cur0/1,top0/1,cur2/3,top2/3 }
3203    punpcklwd        m5, m3                 ; { left/cur0,_/cur1,topleft/top0,_/top1 }
3204%if %1
3205%if ARCH_X86_32
3206    mov              r5, r5m
3207%endif
3208%if %2
3209    movddup          m0, [PIC_ptr(pw_23_22)]
3210%else
3211    movddup          m0, [PIC_ptr(pw_27_17_17_27)]
3212%endif
3213%else
3214    pshufd           m0, m15, q1010
3215%endif
3216    pmaddwd          m5, m0
3217%if %1
3218    paddd            m5, [PIC_ptr(pd_16)]
3219%else
3220    paddd            m5, m14
3221%endif
3222    psrad            m5, 5
3223    packssdw         m5, m5
3224    pmaxsw           m5, m8
3225    pminsw           m5, m9
3226    shufps           m5, m3, q3210          ; cur0/1,top0/1,cur2/3,top2/3
3227    shufps           m3, m5, m7, q3220      ; cur0-7 post-h_filter
3228    shufps           m5, m4, q3231          ; top0-7 post-h_filter
3229
3230    punpckhwd        m7, m5, m3
3231    punpcklwd        m5, m3                 ; {top/cur interleaved}
3232    REPX {pmaddwd x, m2}, m7, m5
3233%if %1
3234    REPX  {paddd x, [PIC_ptr(pd_16)]}, m5, m7
3235%else
3236    REPX  {paddd x, m14}, m5, m7
3237%endif
3238    REPX   {psrad x, 5}, m5, m7
3239    packssdw         m3, m5, m7
3240    pmaxsw           m3, m8
3241    pminsw           m3, m9
3242
3243    ; right half
3244    movu             m4, [grain_lutq+offxyq*2+16]
3245%if ARCH_X86_32
3246    movu             m0, [grain_lutq+r0*2+16]
3247%else
3248    movu             m0, [grain_lutq+top_offxyq*2+16]
3249%endif
3250    punpckhwd        m1, m0, m4
3251    punpcklwd        m0, m4                 ; {top/cur interleaved}
3252    REPX {pmaddwd x, m2}, m1, m0
3253%if %1
3254    REPX  {paddd x, [PIC_ptr(pd_16)]}, m1, m0
3255%else
3256    REPX  {paddd x, m14}, m1, m0
3257%endif
3258    REPX   {psrad x, 5}, m1, m0
3259    packssdw         m4, m0, m1
3260    pmaxsw           m4, m8
3261    pminsw           m4, m9
3262
3263    ; src
3264    mova             m0, [srcq]
3265    mova             m1, [srcq+16]
3266
3267    ; luma_src
3268    pxor          mzero, mzero
3269%if ARCH_X86_32
3270    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
3271
3272    mov           lumaq, r9mp
3273%endif
3274    mova             m6, [lumaq+ 0]
3275    mova             m5, [lumaq+(16<<%2)]
3276%if %2
3277    phaddw           m6, [lumaq+16]
3278    phaddw           m5, [lumaq+48]
3279%endif
3280%if ARCH_X86_32
3281    add           lumaq, r10mp
3282    mov            r9mp, lumaq
3283%endif
3284%if %2
3285    pavgw            m6, mzero
3286    pavgw            m5, mzero
3287%endif
3288
3289%if %1
3290    punpckhwd        m7, m6, m0
3291    punpcklwd        m6, m0
3292    REPX {pmaddwd x, m14}, m7, m6
3293    REPX {psrad   x, 6}, m7, m6
3294    packssdw         m6, m7
3295    punpckhwd        m7, m5, m1
3296    punpcklwd        m5, m1                 ; { luma, chroma }
3297    REPX {pmaddwd x, m14}, m7, m5
3298    REPX {psrad   x, 6}, m7, m5
3299    packssdw         m5, m7
3300    pxor          mzero, mzero
3301    REPX {paddw x, m15}, m6, m5
3302    REPX {pmaxsw x, mzero}, m6, m5
3303    REPX {pminsw x, m10}, m6, m5            ; clip_pixel()
3304%else
3305    REPX  {pand x, m10}, m6, m5
3306%endif
3307
3308    ; scaling[luma_src]
3309%if ARCH_X86_32
3310    vpgatherdw       m7, m6, scalingq-1, r0, r5, 8, 1
3311    vpgatherdw       m6, m5, scalingq-1, r0, r5, 8, 1
3312%else
3313%if %3 == 0
3314    ; register shortage :)
3315    push            r12
3316%endif
3317    vpgatherdw       m7, m6, scalingq-1, r2, r12, 8, 1
3318    vpgatherdw       m6, m5, scalingq-1, r2, r12, 8, 1
3319%if %3 == 0
3320    pop             r12
3321%endif
3322%endif
3323    REPX   {psrlw x, 8}, m7, m6
3324
3325    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
3326    REPX {pmullw x, m11}, m7, m6
3327    pmulhrsw         m3, m7
3328    pmulhrsw         m4, m6
3329
3330    ; dst = clip_pixel(src, noise)
3331    paddw            m0, m3
3332    paddw            m1, m4
3333    pmaxsw           m0, m13
3334    pmaxsw           m1, m13
3335    pminsw           m0, m12
3336    pminsw           m1, m12
3337    movifnidn      dstq, dstmp
3338    mova      [dstq+ 0], m0
3339    mova      [dstq+16], m1
3340
3341%if ARCH_X86_32
3342    add            srcq, r2mp
3343    add            dstq, r2mp
3344    mov           dstmp, dstq
3345%else
3346    add            srcq, r13mp
3347    add            dstq, r13mp
3348    add           lumaq, lstrideq
3349%endif
3350    add      grain_lutq, 82*2
3351    dec              hw
3352%if %3
3353    jg %%loop_y_h_overlap
3354%else
3355    jle %%end_y_hv_overlap
3356    btc              hd, 16
3357    jc %%loop_y_h_overlap
3358%if ARCH_X86_32
3359    mov              r5, r5m
3360%endif
3361    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)+4]
3362    jmp %%loop_y_hv_overlap
3363%%end_y_hv_overlap:
3364%endif
3365%if ARCH_X86_32
3366    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
3367
3368    mov              wq, r4m
3369%endif
3370    add              wq, 16
3371    jge %%end_hv
3372%if ARCH_X86_32
3373    mov            srcq, r1mp
3374%else
3375    mov            srcq, r10mp
3376%endif
3377    mov            dstq, r11mp
3378    mov           lumaq, r12mp
3379    lea            srcq, [srcq+wq*2]
3380    lea            dstq, [dstq+wq*2]
3381    lea           lumaq, [lumaq+wq*(2<<%2)]
3382%if ARCH_X86_32
3383    mov           dstmp, dstq
3384    mov            r9mp, lumaq
3385    mov             r4m, wq
3386%endif
3387%if %2
3388    jmp %%loop_x_hv_overlap
3389%else
3390    or        dword r8m, 4
3391    add          offxyd, 16
3392%if ARCH_X86_32
3393    add dword [rsp+8*mmsize+1*gprsize], 16
3394%else
3395    add            r11d, 16                 ; top_offxy += 16
3396%endif
3397    jmp %%loop_x_odd_v_overlap
3398%endif
3399
3400%%end_hv:
3401    RET
3402%endmacro
3403
3404    %%FGUV_32x32xN_LOOP 1, %2, %3
3405.csfl:
3406    %%FGUV_32x32xN_LOOP 0, %2, %3
3407
3408%if STACK_ALIGNMENT < mmsize
3409DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
3410%endif
3411%endmacro
3412
; Instantiate FGUV_FN three times. Arguments 2 and 3 are forwarded unchanged
; to %%FGUV_32x32xN_LOOP as its %2 and %3, where they select:
;   %2 != 0: adjacent luma samples are pair-summed (phaddw) and then halved
;            with rounding (pavgw against zero) before use, and the luma
;            pointer advances 2<<%2 bytes per output pixel.
;   %3 != 0: the grain row-offset multiplier is halved (164>>%3) and only the
;            first row of a block goes through the vertical-overlap blend;
;            with %3 == 0 a second row is blended using the coefficients at
;            pw_27_17_17_27+4.
; NOTE(review): the first argument (420/422/444) is consumed outside this
; excerpt; presumably it is the chroma-layout tag used in the emitted function
; names -- confirm against the FGUV_FN definition.
3413FGUV_FN 420, 1, 1
3414FGUV_FN 422, 1, 0
3415FGUV_FN 444, 0, 0
3416