; xref: /aosp_15_r20/external/libdav1d/src/x86/filmgrain_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2019-2022, VideoLAN and dav1d authors
; Copyright © 2019-2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

; Build configuration and shared helper macros (x86inc ABI macros, FGData
; struct layout from filmgrain_common.asm).
%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

%if ARCH_X86_64 ; 64-bit only; the matching %endif is later in the file (not shown here)

SECTION_RODATA 32
pb_mask:       db  0,128,128,  0,128,  0,  0,128,128,  0,  0,128,  0,128,128,  0
; gen_shuf[A-E]: byte-pair shuffle patterns for pshufb, used to build the
; sliding windows of neighboring grain samples in the AR filters below.
gen_shufE:     db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
gen_shufA:     db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
gen_shufB:     db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
gen_shufC:     db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
gen_shufD:     db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
; note: the order of (some of) the following constants matter
pb_27_17:      times 2 db 27, 17
byte_blend:            db  0,  0,  0, -1
pb_27_17_17_27:        db 27, 17, 17, 27,  0, 32,  0, 32
pb_17_27:      times 2 db 17, 27
pb_1:          times 4 db 1
pb_23_22:              db 23, 22,  0, 32,  0, 32,  0, 32
next_upperbit_mask:    dw 0x100B, 0x2016, 0x402C, 0x8058
pw_seed_xor:   times 2 dw 0xb524
               times 2 dw 0x49d8
; fg_min/fg_max: output clip limits, indexed by clip_to_restricted_range
; (0/255 = full range, 16/235/240 = limited range luma/chroma).
fg_min:        times 4 db 0
               times 4 db 16
fg_max:        times 4 db 255
               times 4 db 240
               times 4 db 235
pd_m65536:             dd -65536
pw_8:          times 2 dw 8
pw_1024:       times 2 dw 1024
hmul_bits:             dw 32768, 16384,  8192,  4096
round:                 dw  2048,  1024,   512
mul_bits:              dw   256,   128,    64,    32,    16
round_vals:            dw    32,    64,   128,   256,   512
pw_1:                  dw 1

; JMP_TABLE function, isa, lag0, lag1, ...
; Emits a table of 32-bit offsets (relative to the table base) to the
; function's .ar<lag> labels, one entry per trailing argument. The grain
; generators index this table by FGData.ar_coeff_lag to dispatch to the
; matching auto-regression filter.
%macro JMP_TABLE 2-*
    %1_8bpc_%2_table:
    %xdefine %%base %1_8bpc_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %rep %0 - 2
        dd %%prefix %+ .ar%3 - %%base
        %rotate 1
    %endrep
%endmacro

; Dispatch tables for AR lag 0-3, one per grain-generation entry point.
JMP_TABLE generate_grain_y,      avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3

SECTION .text

INIT_YMM avx2
; generate_grain_y_8bpc(buf, fg_data)
; Fills the 82x73 luma grain buffer: advances an LFSR-style seed four lanes
; at a time in SIMD, uses each updated seed's upper bits to index the
; gaussian_sequence table, scales by the grain_scale_shift rounding constant,
; and stores 8 packed int8 grain values per iteration. Afterwards it
; tail-jumps into the .ar<lag> filter selected via the jump table.
cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
%define base r4-generate_grain_y_8bpc_avx2_table
    lea              r4, [generate_grain_y_8bpc_avx2_table]
    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
    movq            xm1, [base+next_upperbit_mask]
    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
    movq            xm4, [base+mul_bits]
    movq            xm5, [base+hmul_bits]
    mov              r7, -73*82                   ; negative counter over the whole buffer
    mova            xm6, [base+pb_mask]
    sub            bufq, r7                       ; point bufq at end so [bufq+r7] walks forward
    vpbroadcastw    xm7, [base+round+r6*2]
    lea              r6, [gaussian_sequence]
    movsxd           r5, [r4+r5*4]                ; offset of .ar<lag> from the table base
.loop:
    ; first 4 seeds
    pand            xm2, xm0, xm1
    psrlw           xm3, xm2, 10
    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw          xm2, xm4            ; bits 0x0f00 are set
    pmulhuw         xm0, xm5
    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm2, xm0            ; aggregate each bit into next seed's high bit
    por             xm3, xm2            ; 4 next output seeds
    pshuflw         xm0, xm3, q3333
    psrlw           xm3, 5
    ; second group of 4 seeds, interleaved with scalar gaussian_sequence
    ; gathers from the first group (r2/r3 hold two 16-bit indices each)
    pand            xm2, xm0, xm1
    movq             r2, xm3
    psrlw           xm3, xm2, 10
    por             xm2, xm3
    pmullw          xm2, xm4
    pmulhuw         xm0, xm5
    movzx           r3d, r2w
    pshufb          xm3, xm6, xm2
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm0, xm2
    movd            xm2, [r6+r3*2]
    rorx             r3, r2, 32
    por             xm3, xm0
    shr             r2d, 16
    pinsrw          xm2, [r6+r2*2], 1
    pshuflw         xm0, xm3, q3333
    movzx           r2d, r3w
    psrlw           xm3, 5
    pinsrw          xm2, [r6+r2*2], 2
    shr             r3d, 16
    movq             r2, xm3
    pinsrw          xm2, [r6+r3*2], 3
    movzx           r3d, r2w
    pinsrw          xm2, [r6+r3*2], 4
    rorx             r3, r2, 32
    shr             r2d, 16
    pinsrw          xm2, [r6+r2*2], 5
    movzx           r2d, r3w
    pinsrw          xm2, [r6+r2*2], 6
    shr             r3d, 16
    pinsrw          xm2, [r6+r3*2], 7
    pmulhrsw        xm2, xm7            ; apply grain_scale_shift rounding
    packsswb        xm2, xm2
    movq      [bufq+r7], xm2
    add              r7, 8
    jl .loop

    ; auto-regression code
    add              r5, r4
    jmp              r5

; Lag-1 auto-regression over the luma grain. The contribution of the three
; top-row neighbors is computed 4 pixels at a time with pmaddwd; the serial
; left-neighbor dependency (cf3 * previous output) is resolved in scalar code.
.ar1:
    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    movd            xm5, [fg_dataq+FGData.ar_coeffs_y]
    mova            xm2, [base+gen_shufC]
    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
    pinsrb          xm5, [base+pb_1], 3
    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12]    ; rnd
    pmovsxbw        xm5, xm5
    pshufd          xm4, xm5, q0000
    pshufd          xm5, xm5, q1111
    sub            bufq, 82*73-(82*3+79)   ; skip the 3-row/3-col border
    mov              hd, 70
    mov            mind, -128
    mov            maxd, 127
.y_loop_ar1:
    mov              xq, -76
    movsx         val3d, byte [bufq+xq-1]
.x_loop_ar1:
    pmovsxbw        xm1, [bufq+xq-82-3]
    pshufb          xm0, xm1, xm2
    punpckhwd       xm1, xm3
    pmaddwd         xm0, xm4
    pmaddwd         xm1, xm5
    paddd           xm0, xm1                ; 4 partial sums (top context + rnd)
.x_loop_ar1_inner:
    movd          val0d, xm0
    psrldq          xm0, 4
    imul          val3d, cf3d
    add           val3d, val0d
    movsx         val0d, byte [bufq+xq]
    sarx          val3d, val3d, shiftd
    add           val3d, val0d
    ; clamp to [-128, 127] via sign flag (values are far from INT_MIN/MAX,
    ; so cmp cannot overflow and SF is reliable)
    cmp           val3d, maxd
    cmovns        val3d, maxd
    cmp           val3d, mind
    cmovs         val3d, mind
    mov       [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc              xq
    jz .x_loop_ar1_end
    test             xb, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1
.x_loop_ar1_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar1
.ar0:                                       ; lag 0: nothing to filter
    RET

; Lag-2 auto-regression over the luma grain: 12 coefficients applied to the
; two rows above (computed 4 pixels ahead via pmaddwd), plus a serial scalar
; pass for the two left-neighbor taps.
.ar2:
%if WIN64
    ; xmm8-15 are used; reserve and save them per the Win64 ABI
    %assign stack_size_padded 168
    SUB             rsp, stack_size_padded
    WIN64_PUSH_XMM   16, 8
%endif
    DEFINE_ARGS buf, fg_data, h, x
    mov             r6d, [fg_dataq+FGData.ar_coeff_shift]
    pmovsxbw        xm7, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
    movd            xm9, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
    vpbroadcastd   xm10, [base+round_vals-14+r6*2]
    movd           xm11, [base+byte_blend+1]
    pmovsxbw        xm9, xm9
    pshufd          xm4, xm7, q0000
    mova           xm12, [base+gen_shufA]
    pshufd          xm5, xm7, q3333
    mova           xm13, [base+gen_shufB]
    pshufd          xm6, xm7, q1111
    mova           xm14, [base+gen_shufC]
    pshufd          xm7, xm7, q2222
    mova           xm15, [base+gen_shufD]
    pshufd          xm8, xm9, q0000
    psrld          xm10, 16
    pshufd          xm9, xm9, q1111
    sub            bufq, 82*73-(82*3+79)
    mov              hd, 70
.y_loop_ar2:
    mov              xq, -76
.x_loop_ar2:
    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    pshufb          xm2, xm0, xm12
    pmaddwd         xm2, xm4
    pshufb          xm3, xm1, xm13
    pmaddwd         xm3, xm5
    paddd           xm2, xm3
    pshufb          xm3, xm0, xm14
    pmaddwd         xm3, xm6
    punpckhqdq      xm0, xm0
    punpcklwd       xm0, xm1
    pmaddwd         xm0, xm7
    pshufb          xm1, xm15
    pmaddwd         xm1, xm8
    paddd           xm2, xm10               ; + rounding constant
    paddd           xm2, xm3
    paddd           xm0, xm1
    paddd           xm2, xm0                ; 4 top-context sums for x..x+3
    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    pmovsxbw        xm1, xm0
    pmaddwd         xm3, xm9, xm1           ; left taps (cf10/cf11)
    psrldq          xm1, 4                  ; y=0,x=0
    paddd           xm3, xm2
    psrldq          xm2, 4                  ; shift top to next pixel
    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    paddw           xm3, xm1
    packsswb        xm3, xm3
    pextrb    [bufq+xq], xm3, 0
    pslldq          xm3, 2
    vpblendvb       xm0, xm3, xm11          ; feed the new pixel back as "left"
    psrldq          xm0, 1
    inc              xq
    jz .x_loop_ar2_end
    test             xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2
.x_loop_ar2_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar2
    RET

INIT_YMM avx2
; Lag-3 auto-regression over the luma grain: 24 coefficients over the three
; rows above (computed 4 pixels ahead with 256-bit pmaddwd; the broadcast
; coefficient vectors are spilled to the stack), plus a serial scalar pass
; for the three left-neighbor taps.
.ar3:
%if WIN64
    ; xmm8-11 used; stack also holds 11 16-byte coefficient spill slots
    ALLOC_STACK   16*14
    %assign stack_size stack_size - 16*4
    WIN64_PUSH_XMM   12, 8
%else
    ALLOC_STACK   16*12
%endif
    mov             r6d, [fg_dataq+FGData.ar_coeff_shift]
    movd           xm11, [base+byte_blend]
    pmovsxbw         m1, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-15
    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
    pshufd           m0, m1, q0000
    mova    [rsp+16* 0], m0
    pshufd           m0, m1, q1111
    mova    [rsp+16* 2], m0
    pshufd           m0, m1, q2222
    mova    [rsp+16* 4], m0
    pshufd           m1, m1, q3333
    mova    [rsp+16* 6], m1
    pshufd          xm0, xm2, q0000
    mova    [rsp+16* 8], xm0
    pshufd          xm0, xm2, q1111
    mova    [rsp+16* 9], xm0
    psrldq          xm7, xm2, 10
    mova             m8, [base+gen_shufA]
    pinsrw          xm2, [base+pw_1], 5
    mova             m9, [base+gen_shufC]
    pshufd          xm2, xm2, q2222
    movu            m10, [base+gen_shufE]
    vpbroadcastw    xm6, [base+round_vals-12+r6*2]
    pinsrw          xm7, [base+round_vals+r6*2-10], 3
    mova    [rsp+16*10], xm2
    DEFINE_ARGS buf, fg_data, h, x
    sub            bufq, 82*73-(82*3+79)
    mov              hd, 70
.y_loop_ar3:
    mov              xq, -76
.x_loop_ar3:
    movu            xm5, [bufq+xq-82*3-3]    ; y=-3,x=[-3,+12]
    vinserti128      m5, [bufq+xq-82*2-3], 1 ; y=-2,x=[-3,+12]
    movu            xm4, [bufq+xq-82*1-3]    ; y=-1,x=[-3,+12]
    punpcklbw        m3, m5, m5
    punpckhwd        m5, m4
    psraw            m3, 8                   ; sign-extend int8 -> int16
    punpcklbw        m5, m5
    psraw            m5, 8
    punpcklbw       xm4, xm4
    psraw           xm4, 8
    pshufb           m0, m3, m8
    pmaddwd          m0, [rsp+16*0]
    pshufb           m1, m3, m9
    pmaddwd          m1, [rsp+16*2]
    shufps           m2, m3, m5, q1032
    paddd            m0, m1
    pshufb           m1, m2, m8
    vperm2i128       m3, m4, 0x21
    pmaddwd          m1, [rsp+16*4]
    shufps          xm2, xm3, q1021
    vpblendd         m2, m3, 0xf0
    pshufb           m2, m10
    paddd            m0, m1
    pmaddwd          m2, [rsp+16*6]
    pshufb          xm1, xm4, xm9
    pmaddwd         xm1, [rsp+16*8]
    shufps          xm4, xm5, q1132
    paddd            m0, m2
    pshufb          xm2, xm4, xm8
    pshufd          xm4, xm4, q2121
    pmaddwd         xm2, [rsp+16*9]
    punpcklwd       xm4, xm6                ; interleave with rounding constant
    pmaddwd         xm4, [rsp+16*10]
    vextracti128    xm3, m0, 1
    paddd           xm0, xm1
    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
    paddd           xm2, xm4
    paddd           xm0, xm2
    paddd           xm0, xm3                ; 4 top-context sums for x..x+3
.x_loop_ar3_inner:
    pmovsxbw        xm2, xm1
    pmaddwd         xm2, xm7                ; left taps (cf21-23)
    pshufd          xm3, xm2, q1111
    paddd           xm2, xm0                ; add top
    paddd           xm2, xm3                ; left+cur
    psrldq          xm0, 4
    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    packsswb        xm2, xm2
    pextrb    [bufq+xq], xm2, 0
    pslldq          xm2, 3
    vpblendvb       xm1, xm2, xm11          ; feed the new pixel back as "left"
    psrldq          xm1, 1
    inc              xq
    jz .x_loop_ar3_end
    test             xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar3
    RET

; GEN_GRAIN_UV_FN ss_name, ss_x, ss_y
; Instantiates generate_grain_uv_<ss_name>_8bpc for one chroma layout:
;   %1 = name suffix (420/422/444), %2 = horizontal subsampling (0/1),
;   %3 = vertical subsampling (0/1).
; Structure mirrors the luma generator: seed/gaussian fill of the chroma
; grain buffer (seed xor'd per-plane via pw_seed_xor), then a tail-jump to
; the AR filter chosen by ar_coeff_lag. The AR filters additionally mix in
; the (optionally 2x/4x downsampled via pb_1 + pmaddubsw) luma grain from
; bufy, weighted by the cf[2*lag*(lag+1)] "luma" coefficient.
%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
INIT_XMM avx2
cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv
%define base r4-generate_grain_uv_%1_8bpc_avx2_table
    lea              r4, [generate_grain_uv_%1_8bpc_avx2_table]
    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
    movq            xm1, [base+next_upperbit_mask]
    movq            xm4, [base+mul_bits]
    movq            xm5, [base+hmul_bits]
    mova            xm6, [base+pb_mask]
    vpbroadcastw    xm7, [base+round+r6*2]
    vpbroadcastd    xm2, [base+pw_seed_xor+uvq*4]
    pxor            xm0, xm2
    lea              r6, [gaussian_sequence]
%if %2
    mov             r7d, 73-35*%3
    add            bufq, 44
.loop_y:
    mov              r5, -44
%else
    mov              r5, -73*82
    sub            bufq, r5
%endif
.loop:
    ; advance 4 seeds in SIMD, then gather 4 gaussian samples
    pand            xm2, xm0, xm1
    psrlw           xm3, xm2, 10
    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw          xm2, xm4            ; bits 0x0f00 are set
    pmulhuw         xm0, xm5
    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm2, xm0            ; aggregate each bit into next seed's high bit
    por             xm2, xm3            ; 4 next output seeds
    pshuflw         xm0, xm2, q3333
    psrlw           xm2, 5
    movq             r8, xm2
    movzx           r9d, r8w
    movd            xm2, [r6+r9*2]
    rorx             r9, r8, 32
    shr             r8d, 16
    pinsrw          xm2, [r6+r8*2], 1
    movzx           r8d, r9w
    pinsrw          xm2, [r6+r8*2], 2
    shr             r9d, 16
    pinsrw          xm2, [r6+r9*2], 3
    pmulhrsw        xm2, xm7            ; apply grain_scale_shift rounding
    packsswb        xm2, xm2
    movd      [bufq+r5], xm2
    add              r5, 4
    jl .loop
%if %2
    add            bufq, 82
    dec             r7d
    jg .loop_y
%endif

    ; auto-regression code
    movsxd           r6, [fg_dataq+FGData.ar_coeff_lag]
    movsxd           r6, [base+generate_grain_uv_%1_8bpc_avx2_table+r6*4]
    add              r6, r4
    jmp              r6

INIT_YMM avx2
; Lag 0: chroma grain is just the (downsampled) luma grain scaled by a
; single coefficient, added to the seeded noise already in buf.
.ar0:
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    imul            uvd, 28
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd            xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    movd            xm3, [base+hmul_bits+shiftq*2]
    DEFINE_ARGS buf, bufy, h
    pmovsxbw        xm2, xm2
%if %2
    vpbroadcastd     m7, [base+pb_1]
    vpbroadcastw     m6, [base+hmul_bits+2+%3*2]
%endif
    vpbroadcastw     m2, xm2
    vpbroadcastw     m3, xm3
    pxor            m12, m12
%if %2
    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
%else
    sub            bufq, 82*70-3
%endif
    add           bufyq, 3+82*3
    mov              hd, 70-35*%3
.y_loop_ar0:
%if %2
    ; first 32 pixels
    movu            xm4, [bufyq]
    vinserti128      m4, [bufyq+32], 1
%if %3
    movu            xm0, [bufyq+82]
    vinserti128      m0, [bufyq+82+32], 1
%endif
    movu            xm5, [bufyq+16]
    vinserti128      m5, [bufyq+48], 1
%if %3
    movu            xm1, [bufyq+82+16]
    vinserti128      m1, [bufyq+82+48], 1
%endif
    pmaddubsw        m4, m7, m4          ; horizontal pairwise sum (2x downsample)
%if %3
    pmaddubsw        m0, m7, m0
%endif
    pmaddubsw        m5, m7, m5
%if %3
    pmaddubsw        m1, m7, m1
    paddw            m4, m0              ; + row below (vertical downsample)
    paddw            m5, m1
%endif
    pmulhrsw         m4, m6
    pmulhrsw         m5, m6
%else
    xor             r3d, r3d
    ; first 32x2 pixels
.x_loop_ar0:
    movu             m4, [bufyq+r3]
    pcmpgtb          m0, m12, m4
    punpckhbw        m5, m4, m0
    punpcklbw        m4, m0
%endif
    pmullw           m4, m2
    pmullw           m5, m2
    pmulhrsw         m4, m3
    pmulhrsw         m5, m3
%if %2
    movu             m1, [bufq]
%else
    movu             m1, [bufq+r3]
%endif
    pcmpgtb          m8, m12, m1
    punpcklbw        m0, m1, m8
    punpckhbw        m1, m8
    paddw            m0, m4
    paddw            m1, m5
    packsswb         m0, m1
%if %2
    movu         [bufq], m0
%else
    movu      [bufq+r3], m0
    add             r3d, 32
    cmp             r3d, 64
    jl .x_loop_ar0
%endif

    ; last 6/12 pixels
    movu            xm4, [bufyq+32*2]
%if %2
%if %3
    movu            xm5, [bufyq+32*2+82]
%endif
    pmaddubsw       xm4, xm7, xm4
%if %3
    pmaddubsw       xm5, xm7, xm5
    paddw           xm4, xm5
%endif
    movq            xm0, [bufq+32]
    pmulhrsw        xm4, xm6
    pmullw          xm4, xm2
    pmulhrsw        xm4, xm3
    pcmpgtb         xm5, xm12, xm0
    punpcklbw       xm5, xm0, xm5
    paddw           xm4, xm5
    packsswb        xm4, xm4
    pblendw         xm0, xm4, xm0, 1000b ; keep the final (4th) word untouched
    movq      [bufq+32], xm0
%else
    movu            xm0, [bufq+64]
    pcmpgtb         xm1, xm12, xm4
    punpckhbw       xm5, xm4, xm1
    punpcklbw       xm4, xm1
    pmullw          xm5, xm2
    pmullw          xm4, xm2
    vpblendd        xm1, xm3, xm12, 0x0c ; zero the multiplier past the row end
    pmulhrsw        xm5, xm1
    pmulhrsw        xm4, xm3
    pcmpgtb         xm1, xm12, xm0
    punpckhbw       xm8, xm0, xm1
    punpcklbw       xm0, xm1
    paddw           xm5, xm8
    paddw           xm0, xm4
    packsswb        xm0, xm5
    movu      [bufq+64], xm0
%endif
    add            bufq, 82
    add           bufyq, 82<<%3
    dec              hd
    jg .y_loop_ar0
    RET

INIT_XMM avx2
; Lag-1 AR: top/left/top-right taps plus the luma coefficient; serial
; left-dependency handled in scalar code as in the luma version.
.ar1:
    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
    imul            uvd, 28
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
    DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
    pmovsxbw        xm4, xm4
    pshufd          xm5, xm4, q1111
    pshufd          xm4, xm4, q0000
    pmovsxwd        xm3, [base+round_vals+shiftq*2-12]    ; rnd
%if %2
    vpbroadcastd    xm7, [base+pb_1]
    vpbroadcastw    xm6, [base+hmul_bits+2+%3*2]
%endif
    vpbroadcastd    xm3, xm3
%if %2
    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub            bufq, 82*70-(82-3)
%endif
    add           bufyq, 79+82*3
    mov              hd, 70-35*%3
    mov            mind, -128
    mov            maxd, 127
.y_loop_ar1:
    mov              xq, -(76>>%2)
    movsx         val3d, byte [bufq+xq-1]
.x_loop_ar1:
    pmovsxbw        xm0, [bufq+xq-82-1]     ; top/left
%if %2
    movq            xm8, [bufyq+xq*2]
%if %3
    movq            xm9, [bufyq+xq*2+82]
%endif
%endif
    psrldq          xm2, xm0, 2             ; top
    psrldq          xm1, xm0, 4             ; top/right
%if %2
    pmaddubsw       xm8, xm7, xm8           ; downsample luma
%if %3
    pmaddubsw       xm9, xm7, xm9
    paddw           xm8, xm9
%endif
    pmulhrsw        xm8, xm6
%else
    pmovsxbw        xm8, [bufyq+xq]
%endif
    punpcklwd       xm0, xm2
    punpcklwd       xm1, xm8
    pmaddwd         xm0, xm4
    pmaddwd         xm1, xm5
    paddd           xm0, xm1
    paddd           xm0, xm3                ; 4 partial sums (top + luma + rnd)
.x_loop_ar1_inner:
    movd          val0d, xm0
    psrldq          xm0, 4
    imul          val3d, cf3d
    add           val3d, val0d
    sarx          val3d, val3d, shiftd
    movsx         val0d, byte [bufq+xq]
    add           val3d, val0d
    ; clamp to [-128, 127] via sign flag
    cmp           val3d, maxd
    cmovns        val3d, maxd
    cmp           val3d, mind
    cmovs         val3d, mind
    mov  byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc              xq
    jz .x_loop_ar1_end
    test             xq, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add            bufq, 82
    add           bufyq, 82<<%3
    dec              hd
    jg .y_loop_ar1
    RET

; Lag-2 AR: two rows of top context plus the luma coefficient; serial
; scalar pass for the left taps.
.ar2:
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28
    vpbroadcastw   xm13, [base+round_vals-12+shiftq*2]
    pmovsxbw        xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-7
    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+8]   ; cf8-12
    pinsrw          xm0, [base+pw_1], 5
%if %2
    vpbroadcastw   xm12, [base+hmul_bits+2+%3*2]
    vpbroadcastd   xm11, [base+pb_1]
%endif
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    pshufd          xm4, xm7, q0000
    pshufd          xm5, xm7, q3333
    pshufd          xm6, xm7, q1111
    pshufd          xm7, xm7, q2222
    pshufd          xm8, xm0, q0000
    pshufd          xm9, xm0, q1111
    pshufd         xm10, xm0, q2222
%if %2
    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub            bufq, 82*70-(82-3)
%endif
    add           bufyq, 79+82*3
    mov              hd, 70-35*%3
.y_loop_ar2:
    mov              xq, -(76>>%2)

.x_loop_ar2:
    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    pshufb          xm2, xm0, [base+gen_shufA]
    pmaddwd         xm2, xm4
    pshufb          xm3, xm1, [base+gen_shufB]
    pmaddwd         xm3, xm5
    paddd           xm2, xm3
    pshufb          xm3, xm0, [base+gen_shufC]
    pmaddwd         xm3, xm6
    punpckhqdq      xm0, xm0                 ; y=-2,x=[+2,+5]
    punpcklwd       xm0, xm1
    pmaddwd         xm0, xm7
    pshufb          xm1, [base+gen_shufD]    ; base-relative like the other gen_shuf* loads
    pmaddwd         xm1, xm8
    paddd           xm2, xm3
    paddd           xm0, xm1
    paddd           xm2, xm0

%if %2
    movq            xm0, [bufyq+xq*2]
%if %3
    movq            xm3, [bufyq+xq*2+82]
%endif
    pmaddubsw       xm0, xm11, xm0          ; downsample luma
%if %3
    pmaddubsw       xm3, xm11, xm3
    paddw           xm0, xm3
%endif
    pmulhrsw        xm0, xm12
%else
    pmovsxbw        xm0, [bufyq+xq]
%endif
    punpcklwd       xm0, xm13               ; interleave with rounding constant
    pmaddwd         xm0, xm10
    paddd           xm2, xm0

    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    pmovsxbw        xm0, xm0
    pmaddwd         xm3, xm0, xm9           ; left taps
    psrldq          xm0, 2
    paddd           xm3, xm2
    psrldq          xm2, 4                  ; shift top to next pixel
    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    pslldq          xm3, 2
    paddw           xm3, xm0
    pblendw         xm0, xm3, 00000010b     ; feed the new pixel back as "left"
    packsswb        xm0, xm0
    pextrb    [bufq+xq], xm0, 1
    inc              xq
    jz .x_loop_ar2_end
    test             xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add            bufq, 82
    add           bufyq, 82<<%3
    dec              hd
    jg .y_loop_ar2
    RET

INIT_YMM avx2
; Lag-3 AR: three rows of top context (256-bit pmaddwd) plus the luma
; coefficient interleaved into xm12; serial scalar pass for the left taps.
.ar3:
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28
    pmovsxbw         m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
    pmovsxbw        xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23
    vpbroadcastb    xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma]
    movd           xm13, [base+round_vals-10+shiftq*2]
    vpbroadcastd   xm14, [base+round_vals-14+shiftq*2]
    pshufd           m6, m0, q0000
    pshufd           m7, m0, q1111
    pshufd           m8, m0, q2222
    pshufd           m9, m0, q3333
    pshufd         xm10, xm1, q0000
    pshufd         xm11, xm1, q1111
    pshufhw        xm12, xm1, q0000
    psraw           xm2, 8
    palignr        xm13, xm1, 10
    punpckhwd      xm12, xm2                     ; interleave luma cf
    psrld          xm14, 16
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
%if %2
    vpbroadcastw   xm15, [base+hmul_bits+2+%3*2]
    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub            bufq, 82*70-(82-3)
%endif
    add           bufyq, 79+82*3
    mov              hd, 70-35*%3
.y_loop_ar3:
    mov              xq, -(76>>%2)
.x_loop_ar3:
    vbroadcasti128   m3, [bufq+xq-82*2-3]         ; y=-2,x=[-3,+12
    palignr         xm1, xm3, [bufq+xq-82*3-9], 6 ; y=-3,x=[-3,+12]
    vbroadcasti128   m4, [bufq+xq-82*1-3]    ; y=-1,x=[-3,+12]
    vpblendd         m3, m1, 0x0f
    pxor             m0, m0
    pcmpgtb          m2, m0, m3
    pcmpgtb          m0, m4
    punpcklbw        m1, m3, m2              ; sign-extend int8 -> int16
    punpckhbw        m3, m2
    punpcklbw        m2, m4, m0
    punpckhbw       xm4, xm0
    pshufb           m0, m1, [base+gen_shufA]
    pmaddwd          m0, m6
    pshufb           m5, m1, [base+gen_shufC]
    pmaddwd          m5, m7
    shufps           m1, m3, q1032
    paddd            m0, m5
    pshufb           m5, m1, [base+gen_shufA]
    pmaddwd          m5, m8
    shufps          xm1, xm3, q2121
    vpblendd         m1, m2, 0xf0
    pshufb           m1, [base+gen_shufE]
    pmaddwd          m1, m9
    paddd            m0, m5
    pshufb          xm3, xm2, [base+gen_shufC]
    paddd            m0, m1
    pmaddwd         xm3, xm10
    palignr         xm1, xm4, xm2, 2
    punpckhwd       xm1, xm2, xm1
    pmaddwd         xm1, xm11
    palignr         xm4, xm2, 12
    paddd           xm3, xm1
%if %2
    vpbroadcastd    xm5, [base+pb_1]
    movq            xm1, [bufyq+xq*2]
    pmaddubsw       xm1, xm5, xm1           ; downsample luma
%if %3
    movq            xm2, [bufyq+xq*2+82]
    pmaddubsw       xm5, xm2
    paddw           xm1, xm5
%endif
    pmulhrsw        xm1, xm15
%else
    pmovsxbw        xm1, [bufyq+xq]
%endif
    punpcklwd       xm4, xm1
    pmaddwd         xm4, xm12
    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
    vextracti128    xm2, m0, 1
    paddd           xm0, xm14
    paddd           xm3, xm4
    paddd           xm0, xm3
    paddd           xm0, xm2                ; 4 top-context sums for x..x+3
.x_loop_ar3_inner:
    pmovsxbw        xm1, xm1
    pmaddwd         xm2, xm13, xm1          ; left taps
    pshuflw         xm3, xm2, q1032
    paddd           xm2, xm0                ; add top
    paddd           xm2, xm3                ; left+cur
    psrldq          xm0, 4
    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    psrldq          xm1, 2
    ; don't packssdw, we only care about one value
    punpckldq       xm2, xm2
    pblendw         xm1, xm2, 0100b         ; feed the new pixel back as "left"
    packsswb        xm1, xm1
    pextrb    [bufq+xq], xm1, 2
    inc              xq
    jz .x_loop_ar3_end
    test             xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add            bufq, 82
    add           bufyq, 82<<%3
    dec              hd
    jg .y_loop_ar3
    RET
%endmacro

INIT_YMM avx2
;---------------------------------------------------------------------------------------
; fgy_32x32xn_8bpc: apply synthesized film grain to a 32-row block of 8bpc luma,
; processed in 32-pixel-wide column strips (w iterates in steps of 32).
; Args (per DEFINE_ARGS): dst, src, stride, fg_data, w, scaling, grain_lut,
;                         h, sby, see, overlap.
; Four strip variants are selected by the overlap flags and sby:
;   .loop_x            - no overlap blending
;   .loop_x_h_overlap  - blend leftmost 1 grain column with previous strip
;   .loop_x_v_overlap  - blend top 2 grain rows with the row above (sby != 0)
;   .loop_x_hv_overlap - both of the above
; Persistent vector registers (set up below, live across all loops):
;   m7  = 0 (zero for byte<->word unpacking)
;   m8  = pd_m65536 (0xffff0000 per dword); doubles as pandn lane mask and as the
;         vpgatherdd write-mask (which the gather clobbers - see save/restore via m6)
;   m9  = rounding multiplier for pmulhrsw, from mul_bits indexed by scaling_shift
;   m10 = fg_min, m11 = fg_max: clip bounds selected by clip_to_restricted_range
;         (fg_max+r7*8 picks the luma max, 255 or 235, from the byte table)
;   m12 = pw_1024 (rounding constant for the overlap blends)
;   m13 = pb_27_17_17_27 horizontal-overlap weights (xmm only)
;   m14 = pb_27_17 / pb_17_27 vertical-overlap weights, reloaded per strip/row pair
;---------------------------------------------------------------------------------------
cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \
                                     grain_lut, h, sby, see, overlap
%define base r9-pd_m65536
    lea              r9, [pd_m65536]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov            sbyd, sbym
    mov        overlapd, [fg_dataq+FGData.overlap_flag]
    vpbroadcastd     m8, [base+pd_m65536]
    vpbroadcastw     m9, [base+mul_bits+r6*2-14]
    vpbroadcastd    m10, [base+fg_min+r7*4]
    vpbroadcastd    m11, [base+fg_max+r7*8]
    vpbroadcastd    m12, [base+pw_1024]
    movq           xm13, [base+pb_27_17_17_27]
    ; vertical overlap only applies below the first superblock row (sby != 0)
    ; and only when the overlap flag is set
    test           sbyd, sbyd
    setnz           r7b
    pxor             m7, m7
    test            r7b, overlapb
    jnz .vertical_overlap

    ; derive the per-row seed from the frame seed and sby
    ; (constants match the reference grain PRNG seeding - see AV1 spec, film grain)
    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rorx           seed, seed, 24
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, overlap

    ; iterate w from -width to 0; src_bak marks the right edge,
    ; dst is kept as an offset relative to src
    lea        src_bakq, [srcq+wq]
    neg              wq
    sub            dstq, srcq

.loop_x:
    ; LFSR-style seed update: candidate seed in r6, feedback bit chosen by the
    ; parity flag of (seeb & seeh); cmovp selects between r6 and r6+0x8000
    rorx             r6, seeq, 1
    or             seed, 0xEFF4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed

    ; extract 4-bit offy / offx fields from the seed and combine into a single
    ; byte offset into grain_lut (stride 82, +747 = base corner offset)
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, overlap

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
.loop_y:
    ; src
    mova             m2, [srcq]
    punpcklbw        m0, m2, m7
    punpckhbw        m1, m2, m7

    ; scaling[src]
    ; Dword-gather scaling[] for 32 pixels. m8 (0xffff0000 per dword) is used as
    ; a pandn mask to isolate the even word lanes and as the gather write-mask;
    ; vpgatherdd zeroes its mask operand, so m6 holds a copy for restoring.
    ; The -0/-2 displacements position each fetched byte in the word lane that
    ; the 0xaa pblendw keeps, interleaving even/odd results back together.
    pandn            m4, m8, m0
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, m0, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m1
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    movu             m5, [grain_lutq+offxyq]
    punpcklbw        m4, m5, m7
    punpckhbw        m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
    mova    [dstq+srcq], m0

    add            srcq, strideq
    add      grain_lutq, 82
    dec              hd
    jg .loop_y

    ; advance to the next 32-pixel column strip
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]
    test       overlapd, overlapd
    jz .loop_x

    ; r8m = sbym
    cmp       dword r8m, 0
    jne .loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
    ; same seed update as .loop_x
    rorx             r6, seeq, 1
    or             seed, 0xEFF4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy

    lea     left_offxyd, [offyq+32]         ; previous column's offy*stride+offx
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y_h_overlap:
    ; src
    mova             m2, [srcq]
    punpcklbw        m0, m2, m7
    punpckhbw        m1, m2, m7

    ; scaling[src]  (same gather scheme as .loop_y; see comments there)
    pandn            m4, m8, m0
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, m0, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m1
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    ; blend leftmost grain pixel with the previous strip's grain using the
    ; 27/17 weights, round via pw_1024, then re-insert as byte 0 (vpblendd 0xfe
    ; keeps all other dwords from the unblended row)
    movu             m5, [grain_lutq+offxyq]
    movd            xm4, [grain_lutq+left_offxyq]
    punpcklbw       xm4, xm5
    pmaddubsw       xm4, xm13, xm4
    pmulhrsw        xm4, xm12
    packsswb        xm4, xm4
    vpblendd         m4, m5, 0xfe
    punpckhbw        m5, m7
    punpcklbw        m4, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
    mova    [dstq+srcq], m0

    add            srcq, strideq
    add      grain_lutq, 82
    dec              hd
    jg .loop_y_h_overlap

    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]

    ; r8m = sbym
    cmp       dword r8m, 0
    jne .loop_x_hv_overlap
    jmp .loop_x_h_overlap

.vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused, sby, see, overlap

    ; build a packed (cur_seed << 16) | top_seed pair so both rows' PRNGs can be
    ; advanced together in the 32-bit seed register
    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, overlap

    lea        src_bakq, [srcq+wq]
    neg              wq
    sub            dstq, srcq

.loop_x_v_overlap:
    vpbroadcastd    m14, [pb_27_17]

    ; we assume from the block above that bits 8-15 of r7d are zeroed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    ; decode both 4-bit offset pairs at once (0xf000f masks both halves)
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
.loop_y_v_overlap:
    ; src
    mova             m2, [srcq]
    punpcklbw        m0, m2, m7
    punpckhbw        m1, m2, m7

    ; scaling[src]  (same gather scheme as .loop_y; see comments there)
    pandn            m4, m8, m0
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, m0, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m1
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    ; vertical blend of the full row with the row above, weights in m14
    movu             m6, [grain_lutq+offxyq]
    movu             m4, [grain_lutq+top_offxyq]
    punpcklbw        m5, m4, m6
    punpckhbw        m4, m6
    pmaddubsw        m5, m14, m5
    pmaddubsw        m4, m14, m4
    pmulhrsw         m5, m12
    pmulhrsw         m4, m12
    packsswb         m5, m4
    punpcklbw        m4, m5, m7
    punpckhbw        m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
    mova    [dstq+srcq], m0

    add            srcq, strideq
    add      grain_lutq, 82
    dec              hb
    jz .end_y_v_overlap
    vpbroadcastd    m14, [pb_17_27] ; swap weights for second v-overlap line
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    add              hd, 0x80000000
    jnc .loop_y_v_overlap
    jmp .loop_y
.end_y_v_overlap:
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap
.loop_x_hv_overlap:
    vpbroadcastd    m14, [pb_27_17]

    ; we assume from the block above that bits 8-15 of r7d are zeroed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy

    lea  topleft_offxyd, [top_offxyq+32]
    lea     left_offxyd, [offyq+32]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
.loop_y_hv_overlap:
    ; src
    mova             m2, [srcq]
    punpcklbw        m0, m2, m7
    punpckhbw        m1, m2, m7

    ; scaling[src]  (same gather scheme as .loop_y; see comments there)
    pandn            m4, m8, m0
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, m0, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m1
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    ; m7 is temporarily repurposed as scratch here and re-zeroed below (pxor)
    movu             m6, [grain_lutq+offxyq]
    movd            xm7, [grain_lutq+left_offxyq]
    movu             m4, [grain_lutq+top_offxyq]
    movd            xm5, [grain_lutq+topleft_offxyq]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       xm7, xm6
    punpcklbw       xm5, xm4
    pmaddubsw       xm7, xm13, xm7
    pmaddubsw       xm5, xm13, xm5
    pmulhrsw        xm7, xm12
    pmulhrsw        xm5, xm12
    packsswb        xm7, xm7
    packsswb        xm5, xm5
    vpblendd         m7, m6, 0xfe
    vpblendd         m5, m4, 0xfe
    ; followed by v interpolation (top | cur -> cur)
    punpckhbw        m4, m6
    punpcklbw        m5, m7
    pmaddubsw        m4, m14, m4
    pmaddubsw        m5, m14, m5
    pmulhrsw         m4, m12
    pmulhrsw         m5, m12
    pxor             m7, m7
    packsswb         m5, m4
    punpcklbw        m4, m5, m7
    punpckhbw        m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
    mova    [dstq+srcq], m0

    add            srcq, strideq
    add      grain_lutq, 82
    dec              hb
    jz .end_y_hv_overlap
    vpbroadcastd    m14, [pb_17_27] ; swap weights for second v-overlap line
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    add              hd, 0x80000000
    jnc .loop_y_hv_overlap
    jmp .loop_y_h_overlap
.end_y_hv_overlap:
    add              wq, 32
    lea            srcq, [src_bakq+wq]
    jl .loop_x_hv_overlap
.end:
    RET
1292
1293%macro FGUV_FN 3 ; name, ss_hor, ss_ver
1294cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
1295                                          grain_lut, h, sby, luma, overlap, uv_pl, is_id
1296%define base r11-pd_m65536
1297    lea             r11, [pd_m65536]
1298    mov             r6d, [fg_dataq+FGData.scaling_shift]
1299    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
1300    mov             r9d, is_idm
1301    mov            sbyd, sbym
1302    mov        overlapd, [fg_dataq+FGData.overlap_flag]
1303    vpbroadcastd     m8, [base+pd_m65536]
1304    vpbroadcastw     m9, [base+mul_bits+r6*2-14]
1305    vpbroadcastd    m10, [base+fg_min+r7*4]
1306    shlx            r7d, r7d, r9d
1307    vpbroadcastd    m11, [base+fg_max+r7*4]
1308    vpbroadcastd    m12, [base+pw_1024]
1309    pxor             m7, m7
1310    test           sbyd, sbyd
1311    setnz           r7b
1312    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
1313    jne .csfl
1314
1315%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
1316    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1317                h, sby, see, overlap, uv_pl
1318%if %1
1319    mov             r6d, uv_plm
1320    vpbroadcastd     m0, [base+pw_8]
1321    vbroadcasti128  m14, [fg_dataq+FGData.uv_mult+r6*4]
1322    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r6*4]
1323    pshufb          m14, m0 ; uv_luma_mult, uv_mult
1324%elif %2
1325    vpbroadcastq    m15, [base+pb_23_22]
1326%else
1327    vpbroadcastq   xm15, [base+pb_27_17_17_27]
1328%endif
1329%if %3
1330    vpbroadcastw    m13, [base+pb_23_22]
1331%elif %2
1332    pshufd          m13, [base+pb_27_17], q0000 ; 8x27_17, 8x17_27
1333%endif
1334    test            r7b, overlapb
1335    jnz %%vertical_overlap
1336
1337    imul           seed, sbyd, (173 << 24) | 37
1338    add            seed, (105 << 24) | 178
1339    rorx           seed, seed, 24
1340    movzx          seed, seew
1341    xor            seed, [fg_dataq+FGData.seed]
1342
1343    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1344                unused2, unused3, see, overlap, unused4, unused5, lstride
1345
1346    mov           lumaq, r9mp
1347    lea             r12, [srcq+wq]
1348    lea             r13, [dstq+wq]
1349    lea             r14, [lumaq+wq*(1+%2)]
1350    mov           r11mp, r12
1351    mov           r12mp, r13
1352    mov        lstrideq, r10mp
1353    neg              wq
1354
1355%%loop_x:
1356    rorx             r6, seeq, 1
1357    or             seed, 0xEFF4
1358    test           seeb, seeh
1359    lea            seed, [r6+0x8000]
1360    cmovp          seed, r6d               ; updated seed
1361
1362    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1363                offx, offy, see, overlap, unused1, unused2, lstride
1364
1365    rorx          offyd, seed, 8
1366    rorx          offxq, seeq, 12
1367    and           offyd, 0xf
1368    imul          offyd, 164>>%3
1369    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
1370
1371    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1372                h, offxy, see, overlap, unused1, unused2, lstride
1373
1374    mov      grain_lutq, grain_lutmp
1375    mov              hd, hm
1376%%loop_y:
1377    ; src
1378%if %2
1379    mova            xm3, [lumaq+lstrideq*0+ 0]
1380    vinserti128      m3, [lumaq+lstrideq*(1+%3) +0], 1
1381    vpbroadcastd     m2, [pb_1]
1382    mova            xm0, [lumaq+lstrideq*0+16]
1383    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
1384    mova            xm1, [srcq]
1385    vinserti128      m1, [srcq+strideq], 1
1386    pmaddubsw        m3, m2
1387    pmaddubsw        m0, m2
1388    pavgw            m3, m7
1389    pavgw            m0, m7
1390%else
1391    mova             m2, [lumaq]
1392    mova             m1, [srcq]
1393%endif
1394%if %1
1395%if %2
1396    packuswb         m2, m3, m0             ; luma
1397%endif
1398    punpckhbw        m3, m2, m1
1399    punpcklbw        m2, m1                 ; { luma, chroma }
1400    pmaddubsw        m3, m14
1401    pmaddubsw        m2, m14
1402    psraw            m3, 6
1403    psraw            m2, 6
1404    paddw            m3, m15
1405    paddw            m2, m15
1406    packuswb         m2, m3                 ; pack+unpack = clip
1407%endif
1408%if %1 || %2 == 0
1409    punpcklbw        m3, m2, m7
1410    punpckhbw        m0, m2, m7
1411%endif
1412
1413    ; scaling[luma_src]
1414    pandn            m4, m8, m3
1415    mova             m6, m8
1416    vpgatherdd       m2, [scalingq+m4-0], m8
1417    psrld            m3, 16
1418    mova             m8, m6
1419    vpgatherdd       m4, [scalingq+m3-2], m6
1420    pandn            m5, m8, m0
1421    mova             m6, m8
1422    vpgatherdd       m3, [scalingq+m5-0], m8
1423    psrld            m0, 16
1424    mova             m8, m6
1425    vpgatherdd       m5, [scalingq+m0-2], m6
1426    pblendw          m2, m4, 0xaa
1427    pblendw          m3, m5, 0xaa
1428
1429    ; grain = grain_lut[offy+y][offx+x]
1430%if %2
1431    movu            xm5, [grain_lutq+offxyq+ 0]
1432    vinserti128      m5, [grain_lutq+offxyq+82], 1
1433%else
1434    movu             m5, [grain_lutq+offxyq]
1435%endif
1436    punpcklbw        m4, m5, m7
1437    punpckhbw        m5, m7
1438
1439    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1440    pmaddubsw        m2, m4
1441    pmaddubsw        m3, m5
1442    pmulhrsw         m2, m9
1443    pmulhrsw         m3, m9
1444
1445    ; unpack chroma_source
1446    punpcklbw        m0, m1, m7
1447    punpckhbw        m1, m7
1448
1449    ; dst = clip_pixel(src, noise)
1450    paddw            m0, m2
1451    paddw            m1, m3
1452    packuswb         m0, m1
1453    pmaxub           m0, m10
1454    pminub           m0, m11
1455%if %2
1456    mova         [dstq], xm0
1457    vextracti128 [dstq+strideq], m0, 1
1458%else
1459    mova         [dstq], m0
1460%endif
1461
1462%if %2
1463    lea            srcq, [srcq+strideq*2]
1464    lea            dstq, [dstq+strideq*2]
1465    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
1466%else
1467    add            srcq, strideq
1468    add            dstq, strideq
1469    add           lumaq, lstrideq
1470%endif
1471    add      grain_lutq, 82<<%2
1472    sub              hb, 1+%2
1473    jg %%loop_y
1474
1475    add              wq, 32>>%2
1476    jge .end
1477    mov            srcq, r11mp
1478    mov            dstq, r12mp
1479    lea           lumaq, [r14+wq*(1+%2)]
1480    add            srcq, wq
1481    add            dstq, wq
1482    test       overlapd, overlapd
1483    jz %%loop_x
1484
1485    ; r8m = sbym
1486    cmp       dword r8m, 0
1487    jne %%loop_x_hv_overlap
1488
1489    ; horizontal overlap (without vertical overlap)
1490%%loop_x_h_overlap:
1491    rorx             r6, seeq, 1
1492    or             seed, 0xEFF4
1493    test           seeb, seeh
1494    lea            seed, [r6+0x8000]
1495    cmovp          seed, r6d               ; updated seed
1496
1497    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1498                offx, offy, see, left_offxy, unused1, unused2, lstride
1499
1500    lea     left_offxyd, [offyq+(32>>%2)]         ; previous column's offy*stride+offx
1501    rorx          offyd, seed, 8
1502    rorx          offxq, seeq, 12
1503    and           offyd, 0xf
1504    imul          offyd, 164>>%3
1505    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
1506
1507    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1508                h, offxy, see, left_offxy, unused1, unused2, lstride
1509
1510    mov      grain_lutq, grain_lutmp
1511    mov              hd, hm
1512%%loop_y_h_overlap:
1513    ; src
1514%if %2
1515    mova            xm3, [lumaq+lstrideq*0+ 0]
1516    vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
1517    vpbroadcastd     m2, [pb_1]
1518    mova            xm0, [lumaq+lstrideq*0+16]
1519    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
1520    mova            xm1, [srcq]
1521    vinserti128      m1, [srcq+strideq], 1
1522    pmaddubsw        m3, m2
1523    pmaddubsw        m0, m2
1524    pavgw            m3, m7
1525    pavgw            m0, m7
1526%else
1527    mova             m2, [lumaq]
1528    mova             m1, [srcq]
1529%endif
1530%if %1
1531%if %2
1532    packuswb         m2, m3, m0             ; luma
1533%endif
1534    punpckhbw        m3, m2, m1
1535    punpcklbw        m2, m1                 ; { luma, chroma }
1536    pmaddubsw        m3, m14
1537    pmaddubsw        m2, m14
1538    psraw            m3, 6
1539    psraw            m2, 6
1540    paddw            m3, m15
1541    paddw            m2, m15
1542    packuswb         m2, m3                 ; pack+unpack = clip
1543%endif
1544%if %1 || %2 == 0
1545    punpcklbw        m3, m2, m7
1546    punpckhbw        m0, m2, m7
1547%endif
1548
1549    ; scaling[luma_src]
1550    pandn            m4, m8, m3
1551    mova             m6, m8
1552    vpgatherdd       m2, [scalingq+m4-0], m8
1553    psrld            m3, 16
1554    mova             m8, m6
1555    vpgatherdd       m4, [scalingq+m3-2], m6
1556    pandn            m5, m8, m0
1557    mova             m6, m8
1558    vpgatherdd       m3, [scalingq+m5-0], m8
1559    psrld            m0, 16
1560    mova             m8, m6
1561    vpgatherdd       m5, [scalingq+m0-2], m6
1562    pblendw          m2, m4, 0xaa
1563    pblendw          m3, m5, 0xaa
1564
1565    ; grain = grain_lut[offy+y][offx+x]
1566%if %2
1567    movu            xm5, [grain_lutq+offxyq+ 0]
1568    vinserti128      m5, [grain_lutq+offxyq+82], 1
1569    movd            xm4, [grain_lutq+left_offxyq+ 0]
1570    vinserti128      m4, [grain_lutq+left_offxyq+82], 1
1571    punpcklbw        m4, m5
1572%if %1
1573    vpbroadcastq     m0, [pb_23_22]
1574    pmaddubsw        m4, m0, m4
1575%else
1576    pmaddubsw        m4, m15, m4
1577%endif
1578    pmulhrsw         m4, m12
1579    packsswb         m4, m4
1580    vpblendd         m4, m5, 0xee
1581%else
1582    movu             m5, [grain_lutq+offxyq]
1583    movd            xm4, [grain_lutq+left_offxyq]
1584    punpcklbw       xm4, xm5
1585%if %1
1586    movq            xm0, [pb_27_17_17_27]
1587    pmaddubsw       xm4, xm0, xm4
1588%else
1589    pmaddubsw       xm4, xm15, xm4
1590%endif
1591    pmulhrsw        xm4, xm12
1592    packsswb        xm4, xm4
1593    vpblendd         m4, m5, 0xfe
1594%endif
1595    punpckhbw        m5, m7
1596    punpcklbw        m4, m7
1597
1598    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1599    pmaddubsw        m2, m4
1600    pmaddubsw        m3, m5
1601    pmulhrsw         m2, m9
1602    pmulhrsw         m3, m9
1603
1604    ; unpack chroma_source
1605    punpcklbw        m0, m1, m7
1606    punpckhbw        m1, m7
1607
1608    ; dst = clip_pixel(src, noise)
1609    paddw            m0, m2
1610    paddw            m1, m3
1611    packuswb         m0, m1
1612    pmaxub           m0, m10
1613    pminub           m0, m11
1614%if %2
1615    mova         [dstq], xm0
1616    vextracti128 [dstq+strideq], m0, 1
1617%else
1618    mova         [dstq], m0
1619%endif
1620
1621%if %2
1622    lea            srcq, [srcq+strideq*2]
1623    lea            dstq, [dstq+strideq*2]
1624    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
1625%else
1626    add            srcq, strideq
1627    add            dstq, strideq
1628    add           lumaq, lstrideq
1629%endif
1630    add      grain_lutq, 82*(1+%2)
1631    sub              hb, 1+%2
1632    jg %%loop_y_h_overlap
1633
1634    add              wq, 32>>%2
1635    jge .end
1636    mov            srcq, r11mp
1637    mov            dstq, r12mp
1638    lea           lumaq, [r14+wq*(1+%2)]
1639    add            srcq, wq
1640    add            dstq, wq
1641
1642    ; r8m = sbym
1643    cmp       dword r8m, 0
1644    jne %%loop_x_hv_overlap
1645    jmp %%loop_x_h_overlap
1646
1647%%vertical_overlap:
1648    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
1649                sby, see, overlap, unused1, unused2, lstride
1650
1651    movzx          sbyd, sbyb
1652    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1653    imul            r7d, sbyd, 173 * 0x00010001
1654    imul           sbyd, 37 * 0x01000100
1655    add             r7d, (105 << 16) | 188
1656    add            sbyd, (178 << 24) | (141 << 8)
1657    and             r7d, 0x00ff00ff
1658    and            sbyd, 0xff00ff00
1659    xor            seed, r7d
1660    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1661
1662    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1663                unused1, unused2, see, overlap, unused3, unused4, lstride
1664
1665    mov           lumaq, r9mp
1666    lea             r12, [srcq+wq]
1667    lea             r13, [dstq+wq]
1668    lea             r14, [lumaq+wq*(1+%2)]
1669    mov           r11mp, r12
1670    mov           r12mp, r13
1671    mov        lstrideq, r10mp
1672    neg              wq
1673
1674%%loop_x_v_overlap:
1675    ; we assume from the block above that bits 8-15 of r7d are zero'ed
1676    mov             r6d, seed
1677    or             seed, 0xeff4eff4
1678    test           seeb, seeh
1679    setp            r7b                     ; parity of top_seed
1680    shr            seed, 16
1681    shl             r7d, 16
1682    test           seeb, seeh
1683    setp            r7b                     ; parity of cur_seed
1684    or              r6d, 0x00010001
1685    xor             r7d, r6d
1686    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1687
1688    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1689                offx, offy, see, overlap, top_offxy, unused, lstride
1690
1691    rorx          offyd, seed, 8
1692    rorx          offxd, seed, 12
1693    and           offyd, 0xf000f
1694    and           offxd, 0xf000f
1695    imul          offyd, 164>>%3
1696    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1697    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
1698
1699    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1700                h, offxy, see, overlap, top_offxy, unused, lstride
1701
1702    mov      grain_lutq, grain_lutmp
1703    mov              hd, hm
1704    movzx    top_offxyd, offxyw
1705    shr          offxyd, 16
1706%if %2 == 0
1707    vpbroadcastd    m13, [pb_27_17]
1708%endif
1709%%loop_y_v_overlap:
1710    ; src
1711%if %2
1712    mova            xm3, [lumaq+lstrideq*0+ 0]
1713    vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
1714    vpbroadcastd     m2, [pb_1]
1715    mova            xm0, [lumaq+lstrideq*0+16]
1716    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
1717    mova            xm1, [srcq]
1718    vinserti128      m1, [srcq+strideq], 1
1719    pmaddubsw        m3, m2
1720    pmaddubsw        m0, m2
1721    pavgw            m3, m7
1722    pavgw            m0, m7
1723%else
1724    mova             m2, [lumaq]
1725    mova             m1, [srcq]
1726%endif
1727%if %1
1728%if %2
1729    packuswb         m2, m3, m0             ; luma
1730%endif
1731    punpckhbw        m3, m2, m1
1732    punpcklbw        m2, m1                 ; { luma, chroma }
1733    pmaddubsw        m3, m14
1734    pmaddubsw        m2, m14
1735    psraw            m3, 6
1736    psraw            m2, 6
1737    paddw            m3, m15
1738    paddw            m2, m15
1739    packuswb         m2, m3                 ; pack+unpack = clip
1740%endif
1741%if %1 || %2 == 0
1742    punpcklbw        m3, m2, m7
1743    punpckhbw        m0, m2, m7
1744%endif
1745
1746    ; scaling[luma_src]
1747    pandn            m4, m8, m3
1748    mova             m6, m8
1749    vpgatherdd       m2, [scalingq+m4-0], m8
1750    psrld            m3, 16
1751    mova             m8, m6
1752    vpgatherdd       m4, [scalingq+m3-2], m6
1753    pandn            m5, m8, m0
1754    mova             m6, m8
1755    vpgatherdd       m3, [scalingq+m5-0], m8
1756    psrld            m0, 16
1757    mova             m8, m6
1758    vpgatherdd       m5, [scalingq+m0-2], m6
1759    pblendw          m2, m4, 0xaa
1760    pblendw          m3, m5, 0xaa
1761
1762    ; grain = grain_lut[offy+y][offx+x]
1763%if %3 == 0
1764%if %2
1765    movu            xm0, [grain_lutq+offxyq]
1766    vinserti128      m0, [grain_lutq+offxyq+82], 1
1767    movu            xm4, [grain_lutq+top_offxyq]
1768    vinserti128      m4, [grain_lutq+top_offxyq+82], 1
1769%else
1770    movu             m0, [grain_lutq+offxyq]
1771    movu             m4, [grain_lutq+top_offxyq]
1772%endif
1773    punpcklbw        m5, m4, m0
1774    punpckhbw        m4, m0
1775    pmaddubsw        m5, m13, m5
1776    pmaddubsw        m4, m13, m4
1777    pmulhrsw         m5, m12
1778    pmulhrsw         m4, m12
1779    packsswb         m5, m4
1780%else
1781    movq            xm4, [grain_lutq+offxyq]
1782    vinserti128      m4, [grain_lutq+offxyq+8], 1
1783    movq            xm5, [grain_lutq+top_offxyq]
1784    vinserti128      m5, [grain_lutq+top_offxyq+8], 1
1785    punpcklbw        m5, m4
1786    pmaddubsw        m5, m13, m5
1787    pmulhrsw         m5, m12
1788    vextracti128    xm4, m5, 1
1789    packsswb        xm5, xm4
1790    ; only interpolate first line, insert second line unmodified
1791    vinserti128      m5, [grain_lutq+offxyq+82], 1
1792%endif
; Widen the blended grain bytes to words.  m7 is expected to be zero here
; (it is used throughout as the zero half for byte->word unpacking) --
; set up outside this view.
1793    punpcklbw        m4, m5, m7
1794    punpckhbw        m5, m7
1795
1796    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1797    pmaddubsw        m2, m4
1798    pmaddubsw        m3, m5
1799    pmulhrsw         m2, m9
1800    pmulhrsw         m3, m9
1801
1802    ; unpack chroma_source
1803    punpcklbw        m0, m1, m7
1804    punpckhbw        m1, m7
1805
1806    ; dst = clip_pixel(src, noise)
; m10/m11 hold the minimum/maximum legal pixel values for the final clamp.
1807    paddw            m0, m2
1808    paddw            m1, m3
1809    packuswb         m0, m1
1810    pmaxub           m0, m10
1811    pminub           m0, m11
1812%if %2
; Horizontally subsampled layouts process two chroma rows per iteration:
; low lane -> first row, high lane -> second row.
1813    mova         [dstq], xm0
1814    vextracti128 [dstq+strideq], m0, 1
1815%else
1816    mova         [dstq], m0
1817%endif
1818
; One (or two, when subsampled) rows consumed per iteration.
1819    sub              hb, 1+%2
1820    jle %%end_y_v_overlap
1821%if %2
1822    lea            srcq, [srcq+strideq*2]
1823    lea            dstq, [dstq+strideq*2]
1824    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
1825%else
1826    add            srcq, strideq
1827    add            dstq, strideq
1828    add           lumaq, lstrideq
1829%endif
1830    add      grain_lutq, 82<<%2
1831%if %2 == 0
; Non-subsampled: vertical overlap covers two source rows.  Switch to the
; second-row blend weights and use the sign bit of hd as a one-shot flag,
; so the v-overlap body runs exactly twice before falling through to the
; regular (no-overlap) row loop.
1832    vpbroadcastd    m13, [pb_17_27]
1833    add              hd, 0x80000000
1834    jnc %%loop_y_v_overlap
1835%endif
1836    jmp %%loop_y
1837
1838%%end_y_v_overlap:
; Advance to the next column of 32 luma pixels (16 chroma pixels when
; horizontally subsampled).  wq counts upward toward zero, so a
; non-negative result means the row of blocks is complete.
1839    add              wq, 32>>%2
1840    jge .end
; r11mp/r12mp hold the saved base src/dst pointers and r14 the luma base
; for this block row (stashed outside this view); rebase all three by the
; (negative) column offset wq.
1841    mov            srcq, r11mp
1842    mov            dstq, r12mp
1843    lea           lumaq, [r14+wq*(1+%2)]
1844    add            srcq, wq
1845    add            dstq, wq
1846
1847    ; since fg_dataq.overlap is guaranteed to be set, we never jump
1848    ; back to .loop_x_v_overlap, and instead always fall-through to
1849    ; h+v overlap
1850
1851%%loop_x_hv_overlap:
1852    ; we assume from the block above that bits 8-15 of r7d are zero'ed
; Advance the grain PRNG for both 16-bit seeds at once; they stay packed
; as (cur_seed << 16) | top_seed.  The byte AND plus parity flag (setp)
; computes the LFSR feedback bit for each half.  NOTE(review): this should
; mirror get_random_number() in the C reference -- confirm there.
1853    mov             r6d, seed
1854    or             seed, 0xeff4eff4
1855    test           seeb, seeh
1856    setp            r7b                     ; parity of top_seed
1857    shr            seed, 16
1858    shl             r7d, 16
1859    test           seeb, seeh
1860    setp            r7b                     ; parity of cur_seed
1861    or              r6d, 0x00010001
1862    xor             r7d, r6d
1863    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1864
1865    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1866                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
1867
; Derive the left/top-left overlap source positions from the previous
; column's offsets (still live in these registers) before they are
; overwritten with the freshly generated random offsets below.
1868    lea  topleft_offxyd, [top_offxyq+(32>>%2)]
1869    lea     left_offxyd, [offyq+(32>>%2)]
; Extract the 4-bit x/y block offsets for both the top and current seeds
; (packed in one register) and convert them into grain_lut byte offsets;
; the 0x10001*... constant adds the fixed grain-LUT border to both halves.
1870    rorx          offyd, seed, 8
1871    rorx          offxd, seed, 12
1872    and           offyd, 0xf000f
1873    and           offxd, 0xf000f
1874    imul          offyd, 164>>%3
1875    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1876    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
1877
1878    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1879                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
1880
1881    mov      grain_lutq, grain_lutmp
1882    mov              hd, hm
; Split the packed pair: low half is the top block's offset, high half the
; current block's.
1883    movzx    top_offxyd, offxyw
1884    shr          offxyd, 16
1885%if %2 == 0
; First-row vertical blend weights for the non-subsampled path (switched
; to pb_17_27 for the second row at the bottom of the loop).
1886    vpbroadcastd    m13, [pb_27_17]
1887%endif
1888%%loop_y_hv_overlap:
1889    ; src
1890%if %2
; Horizontally subsampled: downsample luma to chroma resolution.
; pmaddubsw with pb_1 sums adjacent byte pairs; pavgw with m7 (expected to
; be zero -- set up outside this view) then gives a rounded halving, i.e.
; (a+b+1)>>1 per chroma sample.  Two chroma rows per iteration (lanes).
1891    mova            xm3, [lumaq+lstrideq*0+ 0]
1892    vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
1893    vpbroadcastd     m2, [pb_1]
1894    mova            xm0, [lumaq+lstrideq*0+16]
1895    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
1896    mova            xm1, [srcq]
1897    vinserti128      m1, [srcq+strideq], 1
1898    pmaddubsw        m3, m2
1899    pmaddubsw        m0, m2
1900    pavgw            m3, m7
1901    pavgw            m0, m7
1902%else
1903    mova             m2, [lumaq]
1904    mova             m1, [srcq]
1905%endif
1906%if %1
; Non-csfl path: map luma (and chroma) to the value used to index the
; scaling LUT, via the signed multipliers in m14 and the offset in m15;
; packuswb saturates, so pack+unpack doubles as the clip.
1907%if %2
1908    packuswb         m2, m3, m0             ; luma
1909%endif
1910    punpckhbw        m3, m2, m1
1911    punpcklbw        m2, m1                 ; { luma, chroma }
1912    pmaddubsw        m3, m14
1913    pmaddubsw        m2, m14
1914    psraw            m3, 6
1915    psraw            m2, 6
1916    paddw            m3, m15
1917    paddw            m2, m15
1918    packuswb         m2, m3                 ; pack+unpack = clip
1919%endif
1920%if %1 || %2 == 0
1921    punpcklbw        m3, m2, m7
1922    punpckhbw        m0, m2, m7
1923%endif
1924
1925    ; scaling[luma_src]
; vpgatherdd clobbers its mask register, so m8 is parked in m6 around each
; gather.  Even and odd 16-bit lanes are gathered separately (psrld 16 plus
; the -2 displacement re-aligns the odd halfword) and merged with
; pblendw 0xaa.  NOTE(review): m8 presumably holds a per-dword mask that
; serves both the pandn and the all-lanes gather -- set up outside this view.
1926    pandn            m4, m8, m3
1927    mova             m6, m8
1928    vpgatherdd       m2, [scalingq+m4-0], m8
1929    psrld            m3, 16
1930    mova             m8, m6
1931    vpgatherdd       m4, [scalingq+m3-2], m6
1932    pandn            m5, m8, m0
1933    mova             m6, m8
1934    vpgatherdd       m3, [scalingq+m5-0], m8
1935    psrld            m0, 16
1936    mova             m8, m6
1937    vpgatherdd       m5, [scalingq+m0-2], m6
1938    pblendw          m2, m4, 0xaa
1939    pblendw          m3, m5, 0xaa
1940
1941    ; grain = grain_lut[offy+y][offx+x]
; Fetch the current grain plus the left, top and top-left overlap columns.
; The left edges are blended first (into both the current and the top
; rows), then the vertical top-into-current blend runs in the next section.
1942%if %2
1943    movu            xm4, [grain_lutq+offxyq]
1944    vinserti128      m4, [grain_lutq+offxyq+82], 1
1945    movd            xm0, [grain_lutq+left_offxyq]
1946    vinserti128      m0, [grain_lutq+left_offxyq+82], 1
1947    movd            xm6, [grain_lutq+topleft_offxyq]
1948%if %3
1949    movq            xm5, [grain_lutq+top_offxyq]
1950    vinserti128      m5, [grain_lutq+top_offxyq+8], 1
1951%else
1952    vinserti128      m6, [grain_lutq+topleft_offxyq+82], 1
1953    movu            xm5, [grain_lutq+top_offxyq]
1954    vinserti128      m5, [grain_lutq+top_offxyq+82], 1
1955%endif
1956
1957    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
1958    punpcklbw        m0, m4
1959%if %3
1960    punpcklbw       xm6, xm5
1961%else
1962    punpcklbw        m6, m5
1963%endif
; Both edge blends (left|cur and topleft|top) are packed into one register
; so a single pmaddubsw/pmulhrsw pass handles them.  When the first macro
; arg is set, m15 is occupied by the csfl offset, so the overlap weights
; are loaded from memory; otherwise they are kept resident in m15.
1964    punpcklqdq       m0, m6
1965%if %1
1966    vpbroadcastq     m6, [pb_23_22]
1967    pmaddubsw        m0, m6, m0
1968%else
1969    pmaddubsw        m0, m15, m0
1970%endif
1971    pmulhrsw         m0, m12
1972    packsswb         m0, m0
; Write the blended left-edge pixels back into column 0 of the current
; rows (m4) and, after swapping halves with pshuflw, of the top rows (m5).
1973    vpblendd         m4, m0, 0x11
1974%if %3
1975    pshuflw         xm0, xm0, q1032
1976    vpblendd         m5, m0, 0x01
1977%else
1978    pshuflw          m0, m0, q1032
1979    vpblendd         m5, m0, 0x11
1980%endif
1981%else
; Non-subsampled path: same edge blending, done once per 32-wide row.
1982    movu             m4, [grain_lutq+offxyq]
1983    movd            xm0, [grain_lutq+left_offxyq]
1984    movu             m5, [grain_lutq+top_offxyq]
1985    movd            xm6, [grain_lutq+topleft_offxyq]
1986    punpcklbw       xm0, xm4
1987    punpcklbw       xm6, xm5
1988    punpcklqdq      xm0, xm6
1989%if %1
1990    vpbroadcastq    xm6, [pb_27_17_17_27]
1991    pmaddubsw       xm0, xm6, xm0
1992%else
1993    pmaddubsw       xm0, xm15, xm0
1994%endif
1995    pmulhrsw        xm0, xm12
1996    packsswb        xm0, xm0
1997    vpblendd         m4, m0, 0x01
1998    pshuflw         xm0, xm0, q1032
1999    vpblendd         m5, m0, 0x01
2000%endif
2001
2002    ; followed by v interpolation (top | cur -> cur)
2003%if %3
; Vertically subsampled: only the first of the two rows touches the block
; above, so blend just that row; vpblendd 0xf0 keeps the second row's
; grain unmodified.
2004    vpermq           m0, m4, q3120
2005    punpcklbw        m5, m0
2006    pmaddubsw        m5, m13, m5
2007    pmulhrsw         m5, m12
2008    vextracti128    xm0, m5, 1
2009    packsswb        xm5, xm0
2010    vpblendd         m5, m4, 0xf0
2011%else
; Full-width weighted blend of top and current grain rows (weights in m13,
; rounding via pmulhrsw m12), as in the v-overlap-only loop above.
2012    punpckhbw        m0, m5, m4
2013    punpcklbw        m5, m4
2014    pmaddubsw        m4, m13, m0
2015    pmaddubsw        m5, m13, m5
2016    pmulhrsw         m4, m12
2017    pmulhrsw         m5, m12
2018    packsswb         m5, m4
2019%endif
; Widen grain to words (m7 expected to be zero -- set up outside this view).
2020    punpcklbw        m4, m5, m7
2021    punpckhbw        m5, m7
2022
2023    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2024    pmaddubsw        m2, m4
2025    pmaddubsw        m3, m5
2026    pmulhrsw         m2, m9
2027    pmulhrsw         m3, m9
2028
2029    ; unpack chroma source
2030    punpcklbw        m0, m1, m7
2031    punpckhbw        m1, m7
2032
2033    ; dst = clip_pixel(src, noise)
; m10/m11 hold the minimum/maximum legal pixel values for the final clamp.
2034    paddw            m0, m2
2035    paddw            m1, m3
2036    packuswb         m0, m1
2037    pmaxub           m0, m10
2038    pminub           m0, m11
2039%if %2
; Subsampled layouts store two chroma rows per iteration (one per lane).
2040    mova         [dstq], xm0
2041    vextracti128 [dstq+strideq], m0, 1
2042%else
2043    mova         [dstq], m0
2044%endif
2045
; Row bookkeeping: after the vertical overlap has been applied to the first
; one (subsampled) or two (non-subsampled) rows, the remaining rows of this
; block only need horizontal overlap, so control transfers to the h-overlap
; row loop.
2046%if %2
2047    lea            srcq, [srcq+strideq*2]
2048    lea            dstq, [dstq+strideq*2]
2049    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
2050%else
2051    add            srcq, strideq
2052    add            dstq, strideq
2053    add           lumaq, lstrideq
2054%endif
2055    add      grain_lutq, 82<<%2
2056    sub              hb, 1+%2
2057%if %2
2058    jg %%loop_y_h_overlap
2059%else
; Same sign-bit-of-hd one-shot trick as the v-overlap loop: the second row
; re-runs the hv body with the pb_17_27 weights, then falls through to the
; h-overlap-only loop.
2060    je %%end_y_hv_overlap
2061    vpbroadcastd    m13, [pb_17_27]
2062    add              hd, 0x80000000
2063    jnc %%loop_y_hv_overlap
2064    jmp %%loop_y_h_overlap
2065%endif
2066
2067%%end_y_hv_overlap:
; Next 32-pixel column; every subsequent column has both horizontal and
; vertical overlap, so loop straight back to the hv entry point.
2068    add              wq, 32>>%2
2069    jge .end
2070    mov            srcq, r11mp
2071    mov            dstq, r12mp
2072    lea           lumaq, [r14+wq*(1+%2)]
2073    add            srcq, wq
2074    add            dstq, wq
2075    jmp %%loop_x_hv_overlap
2076%endmacro
2077
; Two instantiations of the main loop: the default entry applies the
; luma-to-chroma mapping (multipliers/offset in m14/m15) before the scaling
; lookup, while the .csfl entry skips it and indexes the scaling LUT with
; the (downsampled) luma directly.  NOTE(review): csfl presumably stands
; for "chroma scaled from luma" -- confirm against the C reference.
2078    %%FGUV_32x32xN_LOOP 1, %2, %3
2079.csfl:
2080    %%FGUV_32x32xN_LOOP 0, %2, %3
2081.end:
2082    RET
2083%endmacro
2084
; Instantiate chroma grain generation and application for each chroma
; layout.  The two trailing flags appear to be horizontal and vertical
; subsampling (they are used as shift amounts inside the macros):
; 4:2:0 subsamples both axes, 4:2:2 only horizontally, 4:4:4 neither.
2085GEN_GRAIN_UV_FN 420, 1, 1
2086FGUV_FN         420, 1, 1
2087GEN_GRAIN_UV_FN 422, 1, 0
2088FGUV_FN         422, 1, 0
2089GEN_GRAIN_UV_FN 444, 0, 0
2090FGUV_FN         444, 0, 0
2091
2092%endif ; ARCH_X86_64
2093