1; Copyright © 2021-2022, VideoLAN and dav1d authors
2; Copyright © 2021-2022, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28%include "x86/filmgrain_common.asm"
29
30%if ARCH_X86_64
31
32SECTION_RODATA 16
33pb_mask:       db  0,128,128,  0,128,  0,  0,128,128,  0,  0,128,  0,128,128,  0
34gen_shufA:     db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
35gen_shufB:     db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
36next_upperbit_mask:    dw 0x100B, 0x2016, 0x402C, 0x8058
37pw_27_17_17_27:        dw 27, 17, 17, 27
38pw_23_22:              dw 23, 22, 0, 32
39pw_seed_xor:   times 2 dw 0xb524
40               times 2 dw 0x49d8
41gen_ar0_shift: times 4 db 128
42               times 4 db 64
43               times 4 db 32
44               times 4 db 16
45pd_16:                 dd 16
46pd_m65536:             dd -65536
47pb_1:          times 4 db 1
48grain_max:     times 2 dw  511
49               times 2 dw 2047
50grain_min:     times 2 dw -512
51               times 2 dw -2048
52fg_max:        times 2 dw 1023
53               times 2 dw 4095
54               times 2 dw 960
55               times 2 dw 3840
56               times 2 dw 940
57               times 2 dw 3760
58fg_min:        times 2 dw 0
59               times 2 dw 64
60               times 2 dw 256
61uv_offset_mul:         dd 256
62                       dd 1024
63hmul_bits:             dw 32768, 16384,  8192,  4096
64round:                 dw  2048,  1024,   512
65mul_bits:              dw   256,   128,    64,    32,    16,     8
66round_vals:            dw    32,    64,   128,   256,   512,  1024
67pb_8_9_0_1:            db 8, 9, 0, 1
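; Constants above generally come in {10 bpc, 12 bpc} pairs: grain_max is
; 511/2047 and grain_min -512/-2048. fg_min/fg_max are the output clipping
; bounds: full range (0 .. 1023/4095) followed by the limited-range chroma
; (64/256 .. 960/3840) and luma (64/256 .. 940/3760) bounds.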
68
69%macro JMP_TABLE 1-*
70    %xdefine %1_table %%table
71    %xdefine %%base %1_table
72    %xdefine %%prefix mangle(private_prefix %+ _%1)
73    %%table:
74    %rep %0 - 1
75        dd %%prefix %+ .ar%2 - %%base
76        %rotate 1
77    %endrep
78%endmacro
79
80JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3
81JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3
82JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3
83JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3
84
85SECTION .text
86
87%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
88
89INIT_YMM avx2
90cglobal generate_grain_y_16bpc, 3, 9, 14, buf, fg_data, bdmax
91%define base r4-generate_grain_y_16bpc_avx2_table
92    lea              r4, [generate_grain_y_16bpc_avx2_table]
93    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
94    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
95    movq            xm1, [base+next_upperbit_mask]
96    mov              r3, -73*82*2
97    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
98    lea             r7d, [bdmaxq+1]
99    movq            xm4, [base+mul_bits]
100    shr             r7d, 11             ; 0 for 10bpc, 2 for 12bpc
101    movq            xm5, [base+hmul_bits]
102    sub              r6, r7
103    mova            xm6, [base+pb_mask]
104    sub            bufq, r3
105    vpbroadcastw    xm7, [base+round+r6*2-2]
106    lea              r6, [gaussian_sequence]
107    movsxd           r5, [r4+r5*4]
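    ; The loop below implements the AV1 film grain LFSR; roughly, per output
    ; (a sketch of the scalar recurrence, not the exact reference code):
    ;   bit   = parity(seed & 0x100b)          ; taps at bits 0, 1, 3, 12
    ;   seed  = (seed >> 1) | (bit << 15)
    ;   grain = round2(gaussian_sequence[(seed >> 5) & 0x7ff],
    ;                  12 - bpc + grain_scale_shift)
    ; The vector code keeps four seeds in flight (see next_upperbit_mask and
    ; pb_mask above) and emits 8 grain values per iteration.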
108.loop:
109    pand            xm2, xm0, xm1
110    psrlw           xm3, xm2, 10
111    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
112    pmullw          xm2, xm4            ; bits 0x0f00 are set
113    pmulhuw         xm0, xm5
114    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
115    psllq           xm2, xm3, 30
116    por             xm2, xm3
117    psllq           xm3, xm2, 15
118    por             xm2, xm0            ; aggregate each bit into next seed's high bit
119    por             xm3, xm2            ; 4 next output seeds
120    pshuflw         xm0, xm3, q3333
121    psrlw           xm3, 5
122    pand            xm2, xm0, xm1
123    movq             r7, xm3
124    psrlw           xm3, xm2, 10
125    por             xm2, xm3
126    pmullw          xm2, xm4
127    pmulhuw         xm0, xm5
128    movzx           r8d, r7w
129    pshufb          xm3, xm6, xm2
130    psllq           xm2, xm3, 30
131    por             xm2, xm3
132    psllq           xm3, xm2, 15
133    por             xm0, xm2
134    movd            xm2, [r6+r8*2]
135    rorx             r8, r7, 32
136    por             xm3, xm0
137    shr             r7d, 16
138    pinsrw          xm2, [r6+r7*2], 1
139    pshuflw         xm0, xm3, q3333
140    movzx           r7d, r8w
141    psrlw           xm3, 5
142    pinsrw          xm2, [r6+r7*2], 2
143    shr             r8d, 16
144    movq             r7, xm3
145    pinsrw          xm2, [r6+r8*2], 3
146    movzx           r8d, r7w
147    pinsrw          xm2, [r6+r8*2], 4
148    rorx             r8, r7, 32
149    shr             r7d, 16
150    pinsrw          xm2, [r6+r7*2], 5
151    movzx           r7d, r8w
152    pinsrw          xm2, [r6+r7*2], 6
153    shr             r8d, 16
154    pinsrw          xm2, [r6+r8*2], 7
155    paddw           xm2, xm2            ; otherwise bpc=12 w/ grain_scale_shift=0
156    pmulhrsw        xm2, xm7            ; shifts by 0, which pmulhrsw does not support
157    mova      [bufq+r3], xm2
158    add              r3, 8*2
159    jl .loop
160
161    ; auto-regression code
162    add              r5, r4
163    jmp              r5
164
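    ; AR(1) luma filter, 4 coefficients; per pixel, roughly (sketch):
    ;   g[y][x] += round2(c0*g[y-1][x-1] + c1*g[y-1][x] + c2*g[y-1][x+1]
    ;                     + c3*g[y][x-1], ar_coeff_shift)
    ; clamped to the bitdepth-dependent grain range (e.g. [-512,511] at
    ; 10 bpc). The three top-row taps are vectorized below; the left tap
    ; stays scalar (val3) because it depends on the pixel just written.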
165.ar1:
166    DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
167    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
168    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
169    movd            xm4, [fg_dataq+FGData.ar_coeffs_y]
170    DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
171    pinsrb          xm4, [base+pb_1], 3
172    pmovsxbw        xm4, xm4
173    pshufd          xm5, xm4, q1111
174    pshufd          xm4, xm4, q0000
175    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12]    ; rnd
176    sub            bufq, 2*(82*73-(82*3+79))
177    mov              hd, 70
178    sar            maxd, 1
179    mov            mind, maxd
180    xor            mind, -1
181.y_loop_ar1:
182    mov              xq, -76
183    movsx         val3d, word [bufq+xq*2-2]
184.x_loop_ar1:
185    movu            xm0, [bufq+xq*2-82*2-2]     ; top/left
186    psrldq          xm2, xm0, 2                 ; top
187    psrldq          xm1, xm0, 4                 ; top/right
188    punpcklwd       xm0, xm2
189    punpcklwd       xm1, xm3
190    pmaddwd         xm0, xm4
191    pmaddwd         xm1, xm5
192    paddd           xm0, xm1
193.x_loop_ar1_inner:
194    movd          val0d, xm0
195    psrldq          xm0, 4
196    imul          val3d, cf3d
197    add           val3d, val0d
198    sarx          val3d, val3d, shiftd
199    movsx         val0d, word [bufq+xq*2]
200    add           val3d, val0d
201    cmp           val3d, maxd
202    cmovg         val3d, maxd
203    cmp           val3d, mind
204    cmovl         val3d, mind
205    mov word [bufq+xq*2], val3w
206    ; keep val3d in-place as left for next x iteration
207    inc              xq
208    jz .x_loop_ar1_end
209    test             xb, 3
210    jnz .x_loop_ar1_inner
211    jmp .x_loop_ar1
212.x_loop_ar1_end:
213    add            bufq, 82*2
214    dec              hd
215    jg .y_loop_ar1
216.ar0:
217    RET
218
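    ; AR(2) luma filter, 12 coefficients: rows y-2/y-1 at x-2..x+2 plus the
    ; current row at x-2/x-1, i.e. roughly g[y][x] += round2(sum of
    ; coeff*neighbor products, ar_coeff_shift), clamped as in AR(1). The two
    ; current-row taps are handled in the scalar inner loop.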
219.ar2:
220    DEFINE_ARGS buf, fg_data, bdmax, shift
221    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
222    movq            xm0, [fg_dataq+FGData.ar_coeffs_y+5]    ; cf5-11
223    vinserti128      m0, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4
224    vpbroadcastw   xm10, [base+round_vals-12+shiftq*2]
225    pxor             m1, m1
226    punpcklwd      xm10, xm1
227    pcmpgtb          m1, m0
228    punpcklbw        m0, m1                                 ; cf5-11,0-4
229    vpermq           m1, m0, q3333                          ; cf4
230    vbroadcasti128  m11, [base+gen_shufA]
231    pshufd           m6, m0, q0000                          ; cf[5,6], cf[0-1]
232    vbroadcasti128  m12, [base+gen_shufB]
233    pshufd           m7, m0, q1111                          ; cf[7,8], cf[2-3]
234    punpckhwd       xm1, xm0
235    pshufhw         xm9, xm0, q2121
236    pshufd          xm8, xm1, q0000                         ; cf[4,9]
237    sar          bdmaxd, 1
238    punpckhqdq      xm9, xm9                                ; cf[10,11]
239    movd            xm4, bdmaxd                             ; max_grain
240    pcmpeqd         xm5, xm5
241    sub            bufq, 2*(82*73-(82*3+79))
242    pxor            xm5, xm4                                ; min_grain
243    DEFINE_ARGS buf, fg_data, h, x
244    mov              hd, 70
245.y_loop_ar2:
246    mov              xq, -76
247.x_loop_ar2:
248    vbroadcasti128   m2, [bufq+xq*2-82*4-4]        ; y=-2,x=[-2,+5]
249    vinserti128      m1, m2, [bufq+xq*2-82*2-4], 0 ; y=-1,x=[-2,+5]
250    pshufb           m0, m1, m11                   ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
251    pmaddwd          m0, m6
252    punpckhwd       xm2, xm1                       ; y=-2/-1 interleaved, x=[+2,+5]
253    pshufb           m1, m12                       ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
254    pmaddwd          m1, m7
255    pmaddwd         xm2, xm8
256    paddd            m0, m1
257    vextracti128    xm1, m0, 1
258    paddd           xm0, xm10
259    paddd           xm2, xm0
260    movu            xm0, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
261    paddd           xm2, xm1
262    pmovsxwd        xm1, [bufq+xq*2]        ; in dwords, y=0,x=[0,3]
263.x_loop_ar2_inner:
264    pmaddwd         xm3, xm9, xm0
265    psrldq          xm0, 2
266    paddd           xm3, xm2
267    psrldq          xm2, 4                  ; shift top to next pixel
268    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
269    ; skip packssdw because we only care about one value
270    paddd           xm3, xm1
271    pminsd          xm3, xm4
272    psrldq          xm1, 4
273    pmaxsd          xm3, xm5
274    pextrw  [bufq+xq*2], xm3, 0
275    punpcklwd       xm3, xm3
276    pblendw         xm0, xm3, 0010b
277    inc              xq
278    jz .x_loop_ar2_end
279    test             xb, 3
280    jnz .x_loop_ar2_inner
281    jmp .x_loop_ar2
282.x_loop_ar2_end:
283    add            bufq, 82*2
284    dec              hd
285    jg .y_loop_ar2
286    RET
287
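    ; AR(3) luma filter, 24 coefficients: rows y-3..y-1 at x-3..x+3 plus the
    ; current row at x-3..x-1, with the same rounding and clamping as above.
    ; Rows y-3/y-2 are processed interleaved in one ymm register; the
    ; current-row taps again go through the scalar inner loop.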
288.ar3:
289    DEFINE_ARGS buf, fg_data, bdmax, shift
290    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
291    sar          bdmaxd, 1
292    movq            xm7, [fg_dataq+FGData.ar_coeffs_y+ 0]    ; cf0-6
293    movd            xm0, [fg_dataq+FGData.ar_coeffs_y+14]    ; cf14-16
294    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13
295    pinsrb          xm0, [base+pb_1], 3                      ; cf14-16,pb_1
296    movd            xm1, [fg_dataq+FGData.ar_coeffs_y+21]    ; cf21-23
297    vinserti128      m7, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13
298    vinserti128      m0, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20
299    vpbroadcastw   xm11, [base+round_vals+shiftq*2-12]
300    movd           xm12, bdmaxd                              ; max_grain
301    punpcklbw        m7, m7                                  ; sign-extension
302    punpcklbw        m0, m0                                  ; sign-extension
303    punpcklbw       xm1, xm1
304    REPX   {psraw x, 8}, m7, m0, xm1
305    pshufd           m4, m7, q0000                           ; cf[0,1] | cf[7,8]
306    pshufd           m5, m7, q1111                           ; cf[2,3] | cf[9,10]
307    pshufd           m6, m7, q2222                           ; cf[4,5] | cf[11,12]
308    pshufd          xm7, xm7, q3333                          ; cf[6,13]
309    pshufd           m8, m0, q0000                           ; cf[14,15] | cf[17,18]
310    pshufd           m9, m0, q1111                           ; cf[16],pw_1 | cf[19,20]
311    paddw           xm0, xm11, xm11
312    pcmpeqd        xm13, xm13
313    pblendw        xm10, xm1, xm0, 00001000b
314    pxor           xm13, xm12                                ; min_grain
315    DEFINE_ARGS buf, fg_data, h, x
316    sub            bufq, 2*(82*73-(82*3+79))
317    mov              hd, 70
318.y_loop_ar3:
319    mov              xq, -76
320.x_loop_ar3:
321    movu            xm0, [bufq+xq*2-82*6-6+ 0]      ; y=-3,x=[-3,+4]
322    vinserti128      m0, [bufq+xq*2-82*4-6+ 0], 1   ; y=-3/-2,x=[-3,+4]
323    movq            xm1, [bufq+xq*2-82*6-6+16]      ; y=-3,x=[+5,+8]
324    vinserti128      m1, [bufq+xq*2-82*4-6+16], 1   ; y=-3/-2,x=[+5,+12]
325    palignr          m3, m1, m0, 2                  ; y=-3/-2,x=[-2,+5]
326    palignr          m1, m0, 12                     ; y=-3/-2,x=[+3,+6]
327    punpckhwd        m2, m0, m3                     ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
328    punpcklwd        m0, m3                         ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
329    shufps           m3, m0, m2, q1032              ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
330    pmaddwd          m0, m4
331    pmaddwd          m2, m6
332    pmaddwd          m3, m5
333    paddd            m0, m2
334    movu            xm2, [bufq+xq*2-82*2-6+ 0]      ; y=-1,x=[-3,+4]
335    vinserti128      m2, [bufq+xq*2-82*2-6+ 6], 1   ; y=-1,x=[+1,+8]
336    paddd            m0, m3
337    psrldq           m3, m2, 2
338    punpcklwd        m3, m2, m3                     ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
339    pmaddwd          m3, m8                         ;      x=[+0/+1,+1/+2,+2/+3,+3/+4]
340    paddd            m0, m3
341    psrldq           m3, m2, 4
342    psrldq           m2, 6
343    vpblendd         m2, m11, 0x0f                  ; rounding constant
344    punpcklwd        m3, m2                         ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
345    pmaddwd          m3, m9                         ;      x=[+2/+3,+3/+4,+4/+5,+5/+6]
346    vextracti128    xm2, m1, 1
347    punpcklwd       xm1, xm2
348    pmaddwd         xm1, xm7                        ; y=-3/-2 interleaved,x=[+3,+4,+5,+6]
349    paddd            m0, m3
350    vextracti128    xm2, m0, 1
351    paddd           xm0, xm1
352    movu            xm1, [bufq+xq*2-6]        ; y=0,x=[-3,+4]
353    paddd           xm0, xm2
354.x_loop_ar3_inner:
355    pmaddwd         xm2, xm1, xm10
356    pshuflw         xm3, xm2, q1032
357    paddd           xm2, xm0                ; add top
358    paddd           xm2, xm3                ; left+cur
359    psrldq          xm0, 4
360    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
361    ; skip packssdw because we only care about one value
362    pminsd          xm2, xm12
363    pmaxsd          xm2, xm13
364    pextrw  [bufq+xq*2], xm2, 0
365    pslldq          xm2, 4
366    psrldq          xm1, 2
367    pblendw         xm1, xm2, 0100b
368    inc              xq
369    jz .x_loop_ar3_end
370    test             xb, 3
371    jnz .x_loop_ar3_inner
372    jmp .x_loop_ar3
373.x_loop_ar3_end:
374    add            bufq, 82*2
375    dec              hd
376    jg .y_loop_ar3
377    RET
378
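; Chroma grain generation reuses the same LFSR, seeded with FGData.seed
; xor'ed with a per-plane constant (pw_seed_xor: 0xb524 for Cb, 0x49d8 for
; Cr), into a subsampled buffer: 44x38 for 4:2:0, 44x73 for 4:2:2 and
; 82x73 for 4:4:4. The chroma AR filters additionally take one tap from
; the co-located (averaged) luma grain via the last ar_coeffs_uv entry.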
379%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
380INIT_XMM avx2
381cglobal generate_grain_uv_%1_16bpc, 4, 11, 8, buf, bufy, fg_data, uv, bdmax
382%define base r8-generate_grain_uv_%1_16bpc_avx2_table
383    lea              r8, [generate_grain_uv_%1_16bpc_avx2_table]
384    movifnidn    bdmaxd, bdmaxm
385    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
386    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
387    movq            xm1, [base+next_upperbit_mask]
388    lea             r6d, [bdmaxq+1]
389    movq            xm4, [base+mul_bits]
390    shr             r6d, 11             ; 0 for 10bpc, 2 for 12bpc
391    movq            xm5, [base+hmul_bits]
392    sub              r5, r6
393    mova            xm6, [base+pb_mask]
394    vpbroadcastd    xm2, [base+pw_seed_xor+uvq*4]
395    vpbroadcastw    xm7, [base+round+r5*2-2]
396    pxor            xm0, xm2
397    lea              r6, [gaussian_sequence]
398%if %2
399    mov             r7d, 73-35*%3
400    add            bufq, 44*2
401.loop_y:
402    mov              r5, -44*2
403%else
404    mov              r5, -82*73*2
405    sub            bufq, r5
406%endif
407.loop_x:
408    pand            xm2, xm0, xm1
409    psrlw           xm3, xm2, 10
410    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
411    pmullw          xm2, xm4            ; bits 0x0f00 are set
412    pmulhuw         xm0, xm5
413    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
414    psllq           xm2, xm3, 30
415    por             xm2, xm3
416    psllq           xm3, xm2, 15
417    por             xm2, xm0            ; aggregate each bit into next seed's high bit
418    por             xm2, xm3            ; 4 next output seeds
419    pshuflw         xm0, xm2, q3333
420    psrlw           xm2, 5
421    movq            r10, xm2
422    movzx           r9d, r10w
423    movd            xm2, [r6+r9*2]
424    rorx             r9, r10, 32
425    shr            r10d, 16
426    pinsrw          xm2, [r6+r10*2], 1
427    movzx          r10d, r9w
428    pinsrw          xm2, [r6+r10*2], 2
429    shr             r9d, 16
430    pinsrw          xm2, [r6+r9*2], 3
431    paddw           xm2, xm2            ; otherwise bpc=12 w/ grain_scale_shift=0
432    pmulhrsw        xm2, xm7            ; shifts by 0, which pmulhrsw does not support
433    movq      [bufq+r5], xm2
434    add              r5, 8
435    jl .loop_x
436%if %2
437    add            bufq, 82*2
438    dec             r7d
439    jg .loop_y
440%endif
441
442    ; auto-regression code
443    movsxd           r6, [fg_dataq+FGData.ar_coeff_lag]
444    movsxd           r6, [r8+r6*4]
445    add              r6, r8
446    jmp              r6
447
448INIT_YMM avx2
449.ar0:
450    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
451    imul            uvd, 28
452    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
453    vpbroadcastb     m0, [fg_dataq+FGData.ar_coeffs_uv+uvq]
454    sar          bdmaxd, 1
455    vpbroadcastd     m4, [base+gen_ar0_shift-24+shiftq*4]
456    movd            xm6, bdmaxd
457    pcmpeqw          m7, m7
458    pmaddubsw        m4, m0  ; ar_coeff << (14 - shift)
459    vpbroadcastw     m6, xm6 ; max_grain
460    pxor             m7, m6  ; min_grain
461    DEFINE_ARGS buf, bufy, h, x
462%if %2
463    vpbroadcastw     m5, [base+hmul_bits+2+%3*2]
464    sub            bufq, 2*(82*(73-35*%3)+82-(82*3+41))
465%else
466    sub            bufq, 2*(82*70-3)
467%endif
468    add           bufyq, 2*(3+82*3)
469    mov              hd, 70-35*%3
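    ; AR(0) chroma: no spatial taps, only the luma tap. Per pixel, roughly
    ; (sketch): uv[y][x] = clamp(uv[y][x] + round2(coeff * luma,
    ; ar_coeff_shift)), where luma is the co-located luma grain averaged
    ; over 1, 2 or 4 samples depending on subsampling. The pmaddubsw above
    ; pre-scales the coefficient so that one pmulhrsw performs both the
    ; multiply and the rounding shift.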
470.y_loop_ar0:
471%if %2
472    ; first 32 pixels
473    movu            xm0, [bufyq+16*0]
474    vinserti128      m0, [bufyq+16*2], 1
475    movu            xm1, [bufyq+16*1]
476    vinserti128      m1, [bufyq+16*3], 1
477%if %3
478    movu            xm2, [bufyq+82*2+16*0]
479    vinserti128      m2, [bufyq+82*2+16*2], 1
480    movu            xm3, [bufyq+82*2+16*1]
481    vinserti128      m3, [bufyq+82*2+16*3], 1
482    paddw            m0, m2
483    paddw            m1, m3
484%endif
485    phaddw           m0, m1
486    movu            xm1, [bufyq+16*4]
487    vinserti128      m1, [bufyq+16*6], 1
488    movu            xm2, [bufyq+16*5]
489    vinserti128      m2, [bufyq+16*7], 1
490%if %3
491    movu            xm3, [bufyq+82*2+16*4]
492    vinserti128      m3, [bufyq+82*2+16*6], 1
493    paddw            m1, m3
494    movu            xm3, [bufyq+82*2+16*5]
495    vinserti128      m3, [bufyq+82*2+16*7], 1
496    paddw            m2, m3
497%endif
498    phaddw           m1, m2
499    pmulhrsw         m0, m5
500    pmulhrsw         m1, m5
501%else
502    xor              xd, xd
503.x_loop_ar0:
504    movu             m0, [bufyq+xq*2]
505    movu             m1, [bufyq+xq*2+32]
506%endif
507    paddw            m0, m0
508    paddw            m1, m1
509    pmulhrsw         m0, m4
510    pmulhrsw         m1, m4
511%if %2
512    paddw            m0, [bufq+ 0]
513    paddw            m1, [bufq+32]
514%else
515    paddw            m0, [bufq+xq*2+ 0]
516    paddw            m1, [bufq+xq*2+32]
517%endif
518    pminsw           m0, m6
519    pminsw           m1, m6
520    pmaxsw           m0, m7
521    pmaxsw           m1, m7
522%if %2
523    movu      [bufq+ 0], m0
524    movu      [bufq+32], m1
525
526    ; last 6 pixels
527    movu            xm0, [bufyq+32*4]
528    movu            xm1, [bufyq+32*4+16]
529%if %3
530    paddw           xm0, [bufyq+32*4+82*2]
531    paddw           xm1, [bufyq+32*4+82*2+16]
532%endif
533    phaddw          xm0, xm1
534    movu            xm1, [bufq+32*2]
535    pmulhrsw        xm0, xm5
536    paddw           xm0, xm0
537    pmulhrsw        xm0, xm4
538    paddw           xm0, xm1
539    pminsw          xm0, xm6
540    pmaxsw          xm0, xm7
541    vpblendd        xm0, xm1, 0x08
542    movu    [bufq+32*2], xm0
543%else
544    movu [bufq+xq*2+ 0], m0
545    movu [bufq+xq*2+32], m1
546    add              xd, 32
547    cmp              xd, 64
548    jl .x_loop_ar0
549
550    ; last 12 pixels
551    movu             m0, [bufyq+64*2]
552    movu             m1, [bufq+64*2]
553    paddw            m0, m0
554    pmulhrsw         m0, m4
555    paddw            m0, m1
556    pminsw           m0, m6
557    pmaxsw           m0, m7
558    vpblendd         m0, m1, 0xc0
559    movu    [bufq+64*2], m0
560%endif
561    add            bufq, 82*2
562    add           bufyq, 82*2<<%3
563    dec              hd
564    jg .y_loop_ar0
565    RET
566
567INIT_XMM avx2
568.ar1:
569    DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift
570    imul            uvd, 28
571    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
572    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
573    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
574    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
575    DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift
576    pmovsxbw        xm4, xm4
577    pshufd          xm5, xm4, q1111
578    pshufd          xm4, xm4, q0000
579    pmovsxwd        xm3, [base+round_vals+shiftq*2-12]    ; rnd
580    vpbroadcastw    xm6, [base+hmul_bits+2+%3*2]
581    vpbroadcastd    xm3, xm3
582%if %2
583    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
584%else
585    sub            bufq, 2*(82*69+3)
586%endif
587    add           bufyq, 2*(79+82*3)
588    mov              hd, 70-35*%3
589    sar            maxd, 1
590    mov            mind, maxd
591    xor            mind, -1
592.y_loop_ar1:
593    mov              xq, -(76>>%2)
594    movsx         val3d, word [bufq+xq*2-2]
595.x_loop_ar1:
596    movu            xm0, [bufq+xq*2-82*2-2] ; top/left
597%if %2
598    movu            xm2, [bufyq+xq*4]
599%else
600    movq            xm2, [bufyq+xq*2]
601%endif
602%if %2
603%if %3
604    phaddw          xm2, [bufyq+xq*4+82*2]
605    punpckhqdq      xm1, xm2, xm2
606    paddw           xm2, xm1
607%else
608    phaddw          xm2, xm2
609%endif
610    pmulhrsw        xm2, xm6
611%endif
612    psrldq          xm1, xm0, 4             ; top/right
613    punpcklwd       xm1, xm2
614    psrldq          xm2, xm0, 2             ; top
615    punpcklwd       xm0, xm2
616    pmaddwd         xm1, xm5
617    pmaddwd         xm0, xm4
618    paddd           xm1, xm3
619    paddd           xm0, xm1
620.x_loop_ar1_inner:
621    movd          val0d, xm0
622    psrldq          xm0, 4
623    imul          val3d, cf3d
624    add           val3d, val0d
625    sarx          val3d, val3d, shiftd
626    movsx         val0d, word [bufq+xq*2]
627    add           val3d, val0d
628    cmp           val3d, maxd
629    cmovg         val3d, maxd
630    cmp           val3d, mind
631    cmovl         val3d, mind
632    mov word [bufq+xq*2], val3w
633    ; keep val3d in-place as left for next x iteration
634    inc              xq
635    jz .x_loop_ar1_end
636    test             xb, 3
637    jnz .x_loop_ar1_inner
638    jmp .x_loop_ar1
639.x_loop_ar1_end:
640    add            bufq, 82*2
641    add           bufyq, 82*2<<%3
642    dec              hd
643    jg .y_loop_ar1
644    RET
645
646INIT_YMM avx2
647.ar2:
648%if WIN64
649    %assign stack_size_padded 136
650    SUB             rsp, stack_size_padded
651    WIN64_PUSH_XMM 13 + %2, 8
652%endif
653    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
654    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
655    imul            uvd, 28
656    vbroadcasti128  m10, [base+gen_shufA]
657    sar          bdmaxd, 1
658    vbroadcasti128  m11, [base+gen_shufB]
659    movd            xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 5]
660    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4
661    pinsrb          xm7, [base+pb_1], 5
662    pinsrw          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3
663    movhps          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
664    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 9], 13
665    pmovsxbw         m7, xm7
666    movd            xm8, bdmaxd             ; max_grain
667    pshufd           m4, m7, q0000
668    vpbroadcastw   xm12, [base+round_vals-12+shiftq*2]
669    pshufd           m5, m7, q1111
670    pcmpeqd         xm9, xm9
671    pshufd           m6, m7, q2222
672    pxor            xm9, xm8                ; min_grain
673    pshufd          xm7, xm7, q3333
674    DEFINE_ARGS buf, bufy, fg_data, h, x
675%if %2
676    vpbroadcastw   xm13, [base+hmul_bits+2+%3*2]
677    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
678%else
679    sub            bufq, 2*(82*69+3)
680%endif
681    add           bufyq, 2*(79+82*3)
682    mov              hd, 70-35*%3
683.y_loop_ar2:
684    mov              xq, -(76>>%2)
685.x_loop_ar2:
686    vbroadcasti128   m3, [bufq+xq*2-82*2-4]        ; y=-1,x=[-2,+5]
687    vinserti128      m2, m3, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5]
688    pshufb           m0, m2, m10                   ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
689    pmaddwd          m0, m4
690    pshufb           m1, m2, m11                   ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
691    pmaddwd          m1, m5
692    punpckhwd        m2, m3                        ; y=-2/-1 interleaved, x=[+2,+5]
693%if %2
694    movu            xm3, [bufyq+xq*4]
695%if %3
696    paddw           xm3, [bufyq+xq*4+82*2]
697%endif
698    phaddw          xm3, xm3
699    pmulhrsw        xm3, xm13
700%else
701    movq            xm3, [bufyq+xq*2]
702%endif
703    punpcklwd       xm3, xm12                   ; luma, round interleaved
704    vpblendd         m2, m3, 0x0f
705    pmaddwd          m2, m6
706    paddd            m1, m0
707    movu            xm0, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
708    paddd            m2, m1
709    vextracti128    xm1, m2, 1
710    paddd           xm2, xm1
711    pshufd          xm1, xm0, q3321
712    pmovsxwd        xm1, xm1                ; y=0,x=[0,3] in dword
713.x_loop_ar2_inner:
714    pmaddwd         xm3, xm7, xm0
715    paddd           xm3, xm2
716    psrldq          xm2, 4                  ; shift top to next pixel
717    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
718    ; we do not need to packssdw since we only care about one value
719    paddd           xm3, xm1
720    psrldq          xm1, 4
721    pminsd          xm3, xm8
722    pmaxsd          xm3, xm9
723    pextrw  [bufq+xq*2], xm3, 0
724    psrldq          xm0, 2
725    pslldq          xm3, 2
726    pblendw         xm0, xm3, 00000010b
727    inc              xq
728    jz .x_loop_ar2_end
729    test             xb, 3
730    jnz .x_loop_ar2_inner
731    jmp .x_loop_ar2
732.x_loop_ar2_end:
733    add            bufq, 82*2
734    add           bufyq, 82*2<<%3
735    dec              hd
736    jg .y_loop_ar2
737    RET
738
739.ar3:
740%if WIN64
741    %assign stack_offset 32
742    %assign stack_size_padded 152
743    SUB             rsp, stack_size_padded
744    WIN64_PUSH_XMM 14 + %2, 8
745%endif
746    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
747    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
748    imul            uvd, 28
749    vpbroadcastw   xm11, [base+round_vals-12+shiftq*2]
750    sar          bdmaxd, 1
751    movq            xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
752    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma
753    movhps          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7]
754    pmovsxbw         m7, xm7
755%if %2
756    vpbroadcastw   xm14, [base+hmul_bits+2+%3*2]
757%endif
758    pshufd           m4, m7, q0000
759    pshufd           m5, m7, q1111
760    pshufd           m6, m7, q2222
761    pshufd           m7, m7, q3333
762    movd            xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14]
763    pinsrb          xm0, [base+pb_1], 3
764    pinsrd          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1
765    pinsrd          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2
766    pmovsxbw         m0, xm0
767    movd           xm12, bdmaxd                 ; max_grain
768    pshufd           m8, m0, q0000
769    pshufd           m9, m0, q1111
770    pcmpeqd        xm13, xm13
771    punpckhqdq     xm10, xm0, xm0
772    pxor           xm13, xm12                   ; min_grain
773    pinsrw         xm10, [base+round_vals-10+shiftq*2], 3
774    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
775%if %2
776    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
777%else
778    sub            bufq, 2*(82*69+3)
779%endif
780    add           bufyq, 2*(79+82*3)
781    mov              hd, 70-35*%3
782.y_loop_ar3:
783    mov              xq, -(76>>%2)
784.x_loop_ar3:
785    movu            xm2, [bufq+xq*2-82*6-6+ 0]    ; y=-3,x=[-3,+4]
786    vinserti128      m2, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4]
787    movq            xm1, [bufq+xq*2-82*6-6+16]    ; y=-3,x=[+5,+8]
788    vinserti128      m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12]
789    palignr          m3, m1, m2, 2                ; y=-3/-2,x=[-2,+5]
790    palignr          m1, m2, 12                   ; y=-3/-2,x=[+3,+6]
791    punpcklwd        m0, m2, m3                   ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
792    punpckhwd        m2, m3                       ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
793    shufps           m3, m0, m2, q1032            ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
794    pmaddwd          m0, m4
795    pmaddwd          m2, m6
796    pmaddwd          m3, m5
797    paddd            m0, m2
798    paddd            m0, m3
799    movu            xm2, [bufq+xq*2-82*2-6+ 0]    ; y=-1,x=[-3,+4]
800    vinserti128      m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8]
801%if %2
802    movu            xm3, [bufyq+xq*4]
803%if %3
804    paddw           xm3, [bufyq+xq*4+82*2]
805%endif
806    phaddw          xm3, xm3
807    pmulhrsw        xm3, xm14
808%else
809    movq            xm3, [bufyq+xq*2]
810%endif
811    punpcklwd        m1, m3
812    pmaddwd          m1, m7
813    paddd            m0, m1
814    psrldq           m1, m2, 4
815    psrldq           m3, m2, 6
816    vpblendd         m3, m11, 0x0f                ; rounding constant
817    punpcklwd        m1, m3                       ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
818    pmaddwd          m1, m9                       ;      x=[+2/+3,+3/+4,+4/+5,+5/+6]
819    psrldq           m3, m2, 2
820    punpcklwd        m2, m3                       ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
821    pmaddwd          m2, m8                       ;      x=[+0/+1,+1/+2,+2/+3,+3/+4]
822    paddd            m0, m1
823    movu            xm1, [bufq+xq*2-6]            ; y=0,x=[-3,+4]
824    paddd            m0, m2
825    vextracti128    xm2, m0, 1
826    paddd           xm0, xm2
827.x_loop_ar3_inner:
828    pmaddwd         xm2, xm1, xm10
829    pshuflw         xm3, xm2, q1032
830    paddd           xm2, xm0                      ; add top
831    paddd           xm2, xm3                      ; left+cur
832    psrldq          xm0, 4
833    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
834    psrldq          xm1, 2
835    ; no need to packssdw since we only care about one value
836    pminsd          xm2, xm12
837    pmaxsd          xm2, xm13
838    pextrw  [bufq+xq*2], xm2, 0
839    pslldq          xm2, 4
840    pblendw         xm1, xm2, 00000100b
841    inc              xq
842    jz .x_loop_ar3_end
843    test             xb, 3
844    jnz .x_loop_ar3_inner
845    jmp .x_loop_ar3
846.x_loop_ar3_end:
847    add            bufq, 82*2
848    add           bufyq, 82*2<<%3
849    dec              hd
850    jg .y_loop_ar3
851    RET
852%endmacro
853
854cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
855                                      grain_lut, unused, sby, see
856%define base r11-grain_min
857    lea             r11, [grain_min]
858    mov             r6d, r9m ; bdmax
859    mov             r9d, [fg_dataq+FGData.clip_to_restricted_range]
860    mov             r7d, [fg_dataq+FGData.scaling_shift]
861    mov            sbyd, sbym
862    vpbroadcastd     m8, r9m
863    shr             r6d, 11  ; is_12bpc
864    vpbroadcastd     m9, [base+grain_min+r6*4]
865    shlx           r10d, r9d, r6d
866    vpbroadcastd    m10, [base+grain_max+r6*4]
867    lea             r9d, [r6+r9*4]
868    vpbroadcastw    m11, [base+mul_bits+r7*2-12]
869    vpbroadcastd    m12, [base+fg_min+r10*4]
870    vpbroadcastd    m13, [base+fg_max+r9*4]
871    test           sbyd, sbyd
872    setnz           r7b
873    vpbroadcastd    m14, [base+pd_16]
874    test            r7b, [fg_dataq+FGData.overlap_flag]
875    jnz .vertical_overlap
876
877    imul           seed, sbyd, (173 << 24) | 37
878    add            seed, (105 << 24) | 178
879    rorx           seed, seed, 24
880    movzx          seed, seew
881    xor            seed, [fg_dataq+FGData.seed]
882
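    ; per-row seed, roughly following the AV1 seed derivation (sketch):
    ;   seed = data.seed ^ (((sby * 37 + 178) & 0xff) << 8)
    ;                    ^  ((sby * 173 + 105) & 0xff)
    ; computed above with a single 32-bit multiply plus a rotate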
883    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
884                offx, offy, see, src_bak
885
886    lea        src_bakq, [srcq+wq*2]
887    neg              wq
888    sub            dstq, srcq
889
890.loop_x:
891    rorx             r6, seeq, 1
892    or             seed, 0xEFF4
893    test           seeb, seeh
894    lea            seed, [r6+0x8000]
895    cmovp          seed, r6d                ; updated seed
896    rorx          offyd, seed, 8
897    rorx          offxq, seeq, 12
898    and           offyd, 0xf
899    imul          offyd, 164
900    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx
901
902    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
903                h, offxy, see, src_bak
904
905    mov      grain_lutq, grain_lutmp
906    mov              hd, hm
907.loop_y:
908    ; scaling[src]
909    mova             m0, [srcq+ 0]
910    mova             m1, [srcq+32]
911    pand             m4, m8, m0
912    psrld            m3, m0, 16
913    mova             m6, m9
914    vpgatherdd       m2, [scalingq+m4-0], m9
915    pand             m3, m8
916    mova             m9, m6
917    vpgatherdd       m4, [scalingq+m3-2], m6
918    pand             m5, m8, m1
919    mova             m6, m9
920    vpgatherdd       m3, [scalingq+m5-0], m9
921    pblendw          m4, m2, 0x55
922    psrld            m2, m1, 16
923    mova             m9, m6
924    pand             m2, m8
925    vpgatherdd       m5, [scalingq+m2-2], m6
926    pblendw          m5, m3, 0x55
927
928    ; noise = round2(scaling[src] * grain, scaling_shift)
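    ; (m11 is 1 << (14 - scaling_shift), so pmaddubsw+paddw yield
    ;  scaling << (15 - scaling_shift); the pmulhrsw against the grain then
    ;  computes (grain*scaling + (1 << (scaling_shift-1))) >> scaling_shift)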
929    pmaddubsw        m4, m11
930    pmaddubsw        m5, m11
931    paddw            m4, m4
932    paddw            m5, m5
933    pmulhrsw         m4, [grain_lutq+offxyq*2]
934    pmulhrsw         m5, [grain_lutq+offxyq*2+32]
935
936    ; dst = clip_pixel(src, noise)
937    paddw            m0, m4
938    paddw            m1, m5
939    pmaxsw           m0, m12
940    pmaxsw           m1, m12
941    pminsw           m0, m13
942    pminsw           m1, m13
943    mova [dstq+srcq+ 0], m0
944    mova [dstq+srcq+32], m1
945
946    add            srcq, strideq
947    add      grain_lutq, 82*2
948    dec              hd
949    jg .loop_y
950    add              wq, 32
951    jge .end
952    lea            srcq, [src_bakq+wq*2]
953    cmp byte [fg_dataq+FGData.overlap_flag], 0
954    je .loop_x
955    movq            xm7, [pw_27_17_17_27]
956    cmp       dword r8m, 0 ; sby
957    jne .loop_x_hv_overlap
958
959    ; horizontal overlap (without vertical overlap)
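    ; the two leftmost grain columns are blended with the previous block,
    ; roughly g = clip((old*27 + new*17 + 16) >> 5, grain_min, grain_max)
    ; for column 0, with the 17/27 weights swapped for column 1
    ; (pw_27_17_17_27); subsampled chroma instead blends a single column
    ; with the 23/22 weights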
960.loop_x_h_overlap:
961    rorx             r6, seeq, 1
962    or             seed, 0xEFF4
963    test           seeb, seeh
964    lea            seed, [r6+0x8000]
965    cmovp          seed, r6d                ; updated seed
966
967    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
968                offx, offy, see, src_bak, left_offxy
969
970    lea     left_offxyd, [offyq+32]         ; previous column's offy*stride+offx
971    rorx          offyd, seed, 8
972    rorx          offxq, seeq, 12
973    and           offyd, 0xf
974    imul          offyd, 164
975    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx
976
977    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
978                h, offxy, see, src_bak, left_offxy
979
980    mov      grain_lutq, grain_lutmp
981    mov              hd, hm
982.loop_y_h_overlap:
983    ; scaling[src]
984    mova             m0, [srcq+ 0]
985    mova             m1, [srcq+32]
986    pand             m4, m8, m0
987    psrld            m3, m0, 16
988    mova             m6, m9
989    vpgatherdd       m2, [scalingq+m4-0], m9
990    pand             m3, m8
991    mova             m9, m6
992    vpgatherdd       m4, [scalingq+m3-2], m6
993    pand             m5, m8, m1
994    mova             m6, m9
995    vpgatherdd       m3, [scalingq+m5-0], m9
996    pblendw          m4, m2, 0x55
997    psrld            m2, m1, 16
998    mova             m9, m6
999    pand             m2, m8
1000    vpgatherdd       m5, [scalingq+m2-2], m6
1001    pblendw          m5, m3, 0x55
1002
1003    ; grain = grain_lut[offy+y][offx+x]
1004    movu             m3, [grain_lutq+offxyq*2]
1005    movd            xm6, [grain_lutq+left_offxyq*2]
1006    punpcklwd       xm6, xm3
1007    pmaddwd         xm6, xm7
1008    paddd           xm6, xm14
1009    psrad           xm6, 5
1010    packssdw        xm6, xm6
1011    pmaxsw          xm6, xm9
1012    pminsw          xm6, xm10
1013    vpblendd         m3, m6, 0x01
1014
1015    ; noise = round2(scaling[src] * grain, scaling_shift)
1016    pmaddubsw        m4, m11
1017    pmaddubsw        m5, m11
1018    paddw            m4, m4
1019    paddw            m5, m5
1020    pmulhrsw         m4, m3
1021    pmulhrsw         m5, [grain_lutq+offxyq*2+32]
1022
1023    ; dst = clip_pixel(src, noise)
1024    paddw            m0, m4
1025    paddw            m1, m5
1026    pmaxsw           m0, m12
1027    pmaxsw           m1, m12
1028    pminsw           m0, m13
1029    pminsw           m1, m13
1030    mova [dstq+srcq+ 0], m0
1031    mova [dstq+srcq+32], m1
1032
1033    add            srcq, strideq
1034    add      grain_lutq, 82*2
1035    dec              hd
1036    jg .loop_y_h_overlap
1037    add              wq, 32
1038    jge .end
1039    lea            srcq, [src_bakq+wq*2]
1040    cmp       dword r8m, 0 ; sby
1041    jne .loop_x_hv_overlap
1042    jmp .loop_x_h_overlap
1043
1044.vertical_overlap:
1045    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
1046                sby, see, src_bak
1047
1048    movzx          sbyd, sbyb
1049    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1050    imul            r7d, sbyd, 173 * 0x00010001
1051    imul           sbyd, 37 * 0x01000100
1052    add             r7d, (105 << 16) | 188
1053    add            sbyd, (178 << 24) | (141 << 8)
1054    and             r7d, 0x00ff00ff
1055    and            sbyd, 0xff00ff00
1056    xor            seed, r7d
1057    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
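    ; two 16-bit seeds are kept packed as (cur_seed << 16) | top_seed so
    ; that grain offsets can be derived for both the current and the
    ; preceding superblock row; the loops below advance both LFSRs in
    ; parallel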
1058
1059    lea        src_bakq, [srcq+wq*2]
1060    neg              wq
1061    sub            dstq, srcq
1062
1063.loop_x_v_overlap:
1064    vpbroadcastd    m15, [pw_27_17_17_27]
1065
1066    ; we assume from the block above that bits 8-15 of r7d are zeroed
1067    mov             r6d, seed
1068    or             seed, 0xeff4eff4
1069    test           seeb, seeh
1070    setp            r7b                     ; parity of top_seed
1071    shr            seed, 16
1072    shl             r7d, 16
1073    test           seeb, seeh
1074    setp            r7b                     ; parity of cur_seed
1075    or              r6d, 0x00010001
1076    xor             r7d, r6d
1077    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1078
1079    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1080                offx, offy, see, src_bak, unused, top_offxy
1081
1082    rorx          offyd, seed, 8
1083    rorx          offxd, seed, 12
1084    and           offyd, 0xf000f
1085    and           offxd, 0xf000f
1086    imul          offyd, 164
1087    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1088    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]
1089
1090    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1091                h, offxy, see, src_bak, unused, top_offxy
1092
1093    mov      grain_lutq, grain_lutmp
1094    mov              hd, hm
1095    movzx    top_offxyd, offxyw
1096    shr          offxyd, 16
1097.loop_y_v_overlap:
1098    ; scaling[src]
1099    mova             m0, [srcq+ 0]
1100    mova             m1, [srcq+32]
1101    pand             m4, m8, m0
1102    psrld            m3, m0, 16
1103    mova             m6, m9
1104    vpgatherdd       m2, [scalingq+m4-0], m9
1105    pand             m3, m8
1106    mova             m9, m6
1107    vpgatherdd       m4, [scalingq+m3-2], m6
1108    pand             m5, m8, m1
1109    mova             m6, m9
1110    vpgatherdd       m3, [scalingq+m5-0], m9
1111    pblendw          m2, m4, 0xaa
1112    psrld            m4, m1, 16
1113    mova             m9, m6
1114    pand             m4, m8
1115    vpgatherdd       m5, [scalingq+m4-2], m6
1116    pblendw          m3, m5, 0xaa
1117
1118    ; grain = grain_lut[offy+y][offx+x]
1119    movu             m6, [grain_lutq+offxyq*2]
1120    movu             m5, [grain_lutq+top_offxyq*2]
1121    punpcklwd        m4, m5, m6
1122    punpckhwd        m5, m6
1123    pmaddwd          m4, m15
1124    pmaddwd          m5, m15
1125    movu             m7, [grain_lutq+offxyq*2+32]
1126    movu             m6, [grain_lutq+top_offxyq*2+32]
1127    paddd            m4, m14
1128    paddd            m5, m14
1129    psrad            m4, 5
1130    psrad            m5, 5
1131    packssdw         m4, m5
1132    punpcklwd        m5, m6, m7
1133    punpckhwd        m6, m7
1134    pmaddwd          m5, m15
1135    pmaddwd          m6, m15
1136    paddd            m5, m14
1137    paddd            m6, m14
1138    psrad            m5, 5
1139    psrad            m6, 5
1140    packssdw         m5, m6
1141    pmaxsw           m4, m9
1142    pmaxsw           m5, m9
1143    pminsw           m4, m10
1144    pminsw           m5, m10
1145
1146    ; noise = round2(scaling[src] * grain, scaling_shift)
1147    pmaddubsw        m2, m11
1148    pmaddubsw        m3, m11
1149    paddw            m2, m2
1150    paddw            m3, m3
1151    pmulhrsw         m4, m2
1152    pmulhrsw         m5, m3
1153
1154    ; dst = clip_pixel(src, noise)
1155    paddw            m0, m4
1156    paddw            m1, m5
1157    pmaxsw           m0, m12
1158    pmaxsw           m1, m12
1159    pminsw           m0, m13
1160    pminsw           m1, m13
1161    mova [dstq+srcq+ 0], m0
1162    mova [dstq+srcq+32], m1
1163
1164    add            srcq, strideq
1165    add      grain_lutq, 82*2
1166    dec              hb
1167    jz .end_y_v_overlap
1168    vpbroadcastd    m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
1169    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1170    ; remaining (up to) 30 lines
1171    add              hd, 0x80000000
1172    jnc .loop_y_v_overlap
1173    jmp .loop_y
1174.end_y_v_overlap:
1175    add              wq, 32
1176    jge .end
1177    lea            srcq, [src_bakq+wq*2]
1178
1179    ; since fg_dataq.overlap is guaranteed to be set, we never jump
1180    ; back to .loop_x_v_overlap, and instead always fall-through to
1181    ; h+v overlap
1182
1183.loop_x_hv_overlap:
1184    vpbroadcastd    m15, [pw_27_17_17_27]
1185
1186    ; we assume from the block above that bits 8-15 of r7d are zeroed
1187    mov             r6d, seed
1188    or             seed, 0xeff4eff4
1189    test           seeb, seeh
1190    setp            r7b                     ; parity of top_seed
1191    shr            seed, 16
1192    shl             r7d, 16
1193    test           seeb, seeh
1194    setp            r7b                     ; parity of cur_seed
1195    or              r6d, 0x00010001
1196    xor             r7d, r6d
1197    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1198
1199    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1200                offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy
1201
1202    lea  topleft_offxyd, [top_offxyq+32]
1203    lea     left_offxyd, [offyq+32]
1204    rorx          offyd, seed, 8
1205    rorx          offxd, seed, 12
1206    and           offyd, 0xf000f
1207    and           offxd, 0xf000f
1208    imul          offyd, 164
1209    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1210    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]
1211
1212    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1213                h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
1214
1215    mov      grain_lutq, grain_lutmp
1216    mov              hd, hm
1217    movzx    top_offxyd, offxyw
1218    shr          offxyd, 16
1219.loop_y_hv_overlap:
1220    ; scaling[src]
1221    mova             m0, [srcq+ 0]
1222    mova             m1, [srcq+32]
1223    pand             m4, m8, m0
1224    psrld            m3, m0, 16
1225    mova             m6, m9
1226    vpgatherdd       m2, [scalingq+m4-0], m9
1227    pand             m3, m8
1228    mova             m9, m6
1229    vpgatherdd       m4, [scalingq+m3-2], m6
1230    pand             m5, m8, m1
1231    mova             m6, m9
1232    vpgatherdd       m3, [scalingq+m5-0], m9
1233    pblendw          m2, m4, 0xaa
1234    psrld            m4, m1, 16
1235    mova             m9, m6
1236    pand             m4, m8
1237    vpgatherdd       m5, [scalingq+m4-2], m6
1238    pblendw          m3, m5, 0xaa
1239
1240    ; grain = grain_lut[offy+y][offx+x]
1241    movu             m7, [grain_lutq+offxyq*2]
1242    movd            xm6, [grain_lutq+left_offxyq*2]
1243    movu             m5, [grain_lutq+top_offxyq*2]
1244    movd            xm4, [grain_lutq+topleft_offxyq*2]
1245    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
1246    punpcklwd       xm6, xm7
1247    punpcklwd       xm4, xm5
1248    punpcklqdq      xm6, xm4
1249    movddup         xm4, [pw_27_17_17_27]
1250    pmaddwd         xm6, xm4
1251    paddd           xm6, xm14
1252    psrad           xm6, 5
1253    packssdw        xm6, xm6
1254    pmaxsw          xm6, xm9
1255    pminsw          xm6, xm10
1256    pshuflw         xm4, xm6, q1032
1257    vpblendd         m6, m7, 0xfe
1258    vpblendd         m4, m5, 0xfe
1259    ; followed by v interpolation (top | cur -> cur)
1260    punpckhwd        m5, m7
1261    pmaddwd          m5, m15
1262    punpcklwd        m4, m6
1263    pmaddwd          m4, m15
1264    movu             m7, [grain_lutq+offxyq*2+32]
1265    movu             m6, [grain_lutq+top_offxyq*2+32]
1266    paddd            m5, m14
1267    paddd            m4, m14
1268    psrad            m5, 5
1269    psrad            m4, 5
1270    packssdw         m4, m5
1271    punpcklwd        m5, m6, m7
1272    punpckhwd        m6, m7
1273    pmaddwd          m5, m15
1274    pmaddwd          m6, m15
1275    paddd            m5, m14
1276    paddd            m6, m14
1277    psrad            m5, 5
1278    psrad            m6, 5
1279    packssdw         m5, m6
1280    pmaxsw           m4, m9
1281    pmaxsw           m5, m9
1282    pminsw           m4, m10
1283    pminsw           m5, m10
1284
1285    ; noise = round2(scaling[src] * grain, scaling_shift)
1286    pmaddubsw        m2, m11
1287    pmaddubsw        m3, m11
1288    paddw            m2, m2
1289    paddw            m3, m3
1290    pmulhrsw         m4, m2
1291    pmulhrsw         m5, m3
1292
1293    ; dst = clip_pixel(src, noise)
1294    paddw            m0, m4
1295    paddw            m1, m5
1296    pmaxsw           m0, m12
1297    pmaxsw           m1, m12
1298    pminsw           m0, m13
1299    pminsw           m1, m13
1300    mova [dstq+srcq+ 0], m0
1301    mova [dstq+srcq+32], m1
1302
1303    add            srcq, strideq
1304    add      grain_lutq, 82*2
1305    dec              hb
1306    jz .end_y_hv_overlap
1307    vpbroadcastd    m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
1308    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1309    ; remaining (up to) 30 lines
1310    add              hd, 0x80000000
1311    jnc .loop_y_hv_overlap
1312    movq            xm7, [pw_27_17_17_27]
1313    jmp .loop_y_h_overlap
1314.end_y_hv_overlap:
1315    add              wq, 32
1316    lea            srcq, [src_bakq+wq*2]
1317    jl .loop_x_hv_overlap
1318.end:
1319    RET
1320
1321%macro FGUV_FN 3 ; name, ss_hor, ss_ver
1322cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
1323                                           grain_lut, h, sby, luma, lstride, uv_pl, is_id
1324%define base r12-grain_min
1325    lea             r12, [grain_min]
1326    mov             r9d, r13m               ; bdmax
1327    mov             r7d, [fg_dataq+FGData.scaling_shift]
1328    mov            r11d, is_idm
1329    mov            sbyd, sbym
1330    vpbroadcastw    m11, [base+mul_bits+r7*2-12]
1331    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
1332    shr             r9d, 11                 ; is_12bpc
1333    vpbroadcastd     m8, [base+grain_min+r9*4]
1334    shlx           r10d, r6d, r9d
1335    vpbroadcastd     m9, [base+grain_max+r9*4]
1336    vpbroadcastw    m10, r13m
1337    shlx            r6d, r6d, r11d
1338    vpbroadcastd    m12, [base+fg_min+r10*4]
1339    lea             r6d, [r9+r6*2]
1340    vpbroadcastd    m13, [base+fg_max+r6*4]
1341    test           sbyd, sbyd
1342    setnz           r7b
1343    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
1344    jne .csfl
1345
1346%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
1347    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1348                unused, sby, see, overlap
1349
1350%if %1
1351    mov             r6d, r11m
1352    vpbroadcastd     m0, [base+pb_8_9_0_1]
1353    vpbroadcastd     m1, [base+uv_offset_mul+r9*4]
1354    vbroadcasti128  m14, [fg_dataq+FGData.uv_mult+r6*4]
1355    vpbroadcastd    m15, [fg_dataq+FGData.uv_offset+r6*4]
1356    pshufb          m14, m0 ; { uv_luma_mult, uv_mult }
1357    pmaddwd         m15, m1
1358%else
1359%if %2
1360    vpbroadcastq    m15, [base+pw_23_22]
1361%else
1362    vpbroadcastq    m15, [base+pw_27_17_17_27]
1363%endif
1364    vpbroadcastd    m14, [base+pd_16]
1365%endif
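    ; when chroma_scaling_from_luma is disabled (%1), the scaling LUT is
    ; indexed with a remapped value, roughly (sketch):
    ;   t = clip(((luma*uv_luma_mult + chroma*uv_mult) >> 6)
    ;            + (uv_offset << (bpc - 8)), 0, bdmax)
    ; using the packed multipliers in m14 and the pre-scaled offset in m15;
    ; with csfl enabled the luma value indexes the LUT directly (.csfl)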
1366    test            r7b, [fg_dataq+FGData.overlap_flag]
1367    jnz %%vertical_overlap
1368
1369    imul           seed, sbyd, (173 << 24) | 37
1370    add            seed, (105 << 24) | 178
1371    rorx           seed, seed, 24
1372    movzx          seed, seew
1373    xor            seed, [fg_dataq+FGData.seed]
1374
1375    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1376                unused2, unused3, see, unused4, unused5, unused6, luma, lstride
1377
1378    mov           lumaq, r9mp
1379    mov        lstrideq, r10mp
1380    lea             r10, [srcq+wq*2]
1381    lea             r11, [dstq+wq*2]
1382    lea             r12, [lumaq+wq*(2<<%2)]
1383    mov            r9mp, r10
1384    mov           r11mp, r11
1385    mov           r12mp, r12
1386    neg              wq
1387
1388%%loop_x:
1389    rorx             r6, seeq, 1
1390    or             seed, 0xEFF4
1391    test           seeb, seeh
1392    lea            seed, [r6+0x8000]
1393    cmovp          seed, r6d               ; updated seed
1394
1395    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1396                offx, offy, see, unused1, unused2, unused3, luma, lstride
1397
1398    rorx          offyd, seed, 8
1399    rorx          offxq, seeq, 12
1400    and           offyd, 0xf
1401    imul          offyd, 164>>%3
1402    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))]  ; offy*stride+offx
1403
1404    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1405                h, offxy, see, unused1, unused2, unused3, luma, lstride
1406
1407    mov      grain_lutq, grain_lutmp
1408    mov              hd, hm
1409%%loop_y:
1410    ; luma_src
1411%if %2
1412    mova            xm2, [lumaq+lstrideq*0+ 0]
1413    vinserti128      m2, [lumaq+lstrideq*0+32], 1
1414    mova            xm4, [lumaq+lstrideq*0+16]
1415    vinserti128      m4, [lumaq+lstrideq*0+48], 1
1416    mova            xm3, [lumaq+lstrideq*(1<<%3)+ 0]
1417    vinserti128      m3, [lumaq+lstrideq*(1<<%3)+32], 1
1418    mova            xm5, [lumaq+lstrideq*(1<<%3)+16]
1419    vinserti128      m5, [lumaq+lstrideq*(1<<%3)+48], 1
1420    phaddw           m2, m4
1421    phaddw           m3, m5
1422    pxor             m4, m4
1423    pavgw            m2, m4
1424    pavgw            m3, m4
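    ; subsampled chroma: each chroma pixel uses the average of the two
    ; horizontally co-located luma pixels, (l0 + l1 + 1) >> 1 (phaddw
    ; followed by pavgw against zero); with 4:2:0 only every other luma
    ; row is read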
1425%elif %1
1426    mova             m2, [lumaq+ 0]
1427    mova             m3, [lumaq+32]
1428%endif
1429%if %1
1430    mova             m0, [srcq]
1431%if %2
1432    mova             m1, [srcq+strideq]
1433%else
1434    mova             m1, [srcq+32]
1435%endif
1436    punpckhwd        m4, m2, m0
1437    punpcklwd        m2, m0
1438    punpckhwd        m5, m3, m1
1439    punpcklwd        m3, m1                 ; { luma, chroma }
1440    REPX {pmaddwd x, m14}, m4, m2, m5, m3
1441    REPX {paddd   x, m15}, m4, m2, m5, m3
1442    REPX {psrad   x, 6  }, m4, m2, m5, m3
1443    packusdw         m2, m4
1444    packusdw         m3, m5
1445    pminuw           m2, m10
1446    pminuw           m3, m10                ; clip_pixel()
1447%elif %2
1448    pand             m2, m10
1449    pand             m3, m10
1450%else
1451    pand             m2, m10, [lumaq+ 0]
1452    pand             m3, m10, [lumaq+32]
1453%endif
1454
1455    ; scaling[luma_src]
1456    vpbroadcastd     m7, [pd_m65536]
1457    pandn            m4, m7, m2
1458    mova             m6, m7
1459    vpgatherdd       m5, [scalingq+m4-0], m7
1460    psrld            m2, 16
1461    mova             m7, m6
1462    vpgatherdd       m4, [scalingq+m2-2], m6
1463    pblendw          m4, m5, 0x55
1464    pandn            m5, m7, m3
1465    mova             m6, m7
1466    vpgatherdd       m2, [scalingq+m5-0], m7
1467    psrld            m3, 16
1468    vpgatherdd       m5, [scalingq+m3-2], m6
1469    pblendw          m5, m2, 0x55
1470
1471    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw        m4, m11
    pmaddubsw        m5, m11
    paddw            m4, m4
    paddw            m5, m5
    pmulhrsw         m4, [grain_lutq+offxyq*2]
%if %2
    pmulhrsw         m5, [grain_lutq+offxyq*2+82*2]
%else
    pmulhrsw         m5, [grain_lutq+offxyq*2+32]
%endif

    ; dst = clip_pixel(src, noise)
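    ; add the noise to the source pixels and clamp to the fg_min/fg_max
    ; limits in m12/m13 (bitdepth and full- vs limited-range dependent)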
%if %1
    paddw            m0, m4
    paddw            m1, m5
%else
    paddw            m0, m4, [srcq]
%if %2
    paddw            m1, m5, [srcq+strideq]
%else
    paddw            m1, m5, [srcq+32]
%endif
%endif
    pmaxsw           m0, m12
    pmaxsw           m1, m12
    pminsw           m0, m13
    pminsw           m1, m13
    mova         [dstq], m0
%if %2
    mova [dstq+strideq], m1
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    mova      [dstq+32], m1
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82*(2<<%2)
%if %2
    sub              hb, 2
%else
    dec              hb
%endif
    jg %%loop_y
    add              wq, 32>>%2
    jge .end
    mov            srcq, r9mp
    mov            dstq, r11mp
    mov           lumaq, r12mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]
    cmp byte [fg_dataq+FGData.overlap_flag], 0
    je %%loop_x
    cmp       dword r8m, 0 ; sby
    jne %%loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    rorx             r6, seeq, 1
    or             seed, 0xEFF4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, luma, lstride

    lea     left_offxyd, [offyq+(32>>%2)]         ; previous column's offy*stride+offx
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, unused1, unused2, luma, lstride

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
%%loop_y_h_overlap:
    ; luma_src
%if %2
    mova            xm2, [lumaq+lstrideq*0+ 0]
    vinserti128      m2, [lumaq+lstrideq*0+32], 1
    mova            xm4, [lumaq+lstrideq*0+16]
    vinserti128      m4, [lumaq+lstrideq*0+48], 1
    mova            xm3, [lumaq+lstrideq*(1<<%3)+ 0]
    vinserti128      m3, [lumaq+lstrideq*(1<<%3)+32], 1
    mova            xm5, [lumaq+lstrideq*(1<<%3)+16]
    vinserti128      m5, [lumaq+lstrideq*(1<<%3)+48], 1
    phaddw           m2, m4
    phaddw           m3, m5
    pxor             m4, m4
    pavgw            m2, m4
    pavgw            m3, m4
%elif %1
    mova             m2, [lumaq]
    mova             m3, [lumaq+32]
%endif
%if %1
    mova             m0, [srcq]
%if %2
    mova             m1, [srcq+strideq]
%else
    mova             m1, [srcq+32]
%endif
    punpckhwd        m4, m2, m0
    punpcklwd        m2, m0
    punpckhwd        m5, m3, m1
    punpcklwd        m3, m1                 ; { luma, chroma }
    REPX {pmaddwd x, m14}, m4, m2, m5, m3
    REPX {paddd   x, m15}, m4, m2, m5, m3
    REPX {psrad   x, 6  }, m4, m2, m5, m3
    packusdw         m2, m4
    packusdw         m3, m5
    pminuw           m2, m10                ; clip_pixel()
    pminuw           m3, m10
%elif %2
    pand             m2, m10
    pand             m3, m10
%else
    pand             m2, m10, [lumaq+ 0]
    pand             m3, m10, [lumaq+32]
%endif

    ; scaling[luma_src]
    vpbroadcastd     m7, [pd_m65536]
    pandn            m4, m7, m2
    mova             m6, m7
    vpgatherdd       m5, [scalingq+m4-0], m7
    psrld            m2, 16
    mova             m7, m6
    vpgatherdd       m4, [scalingq+m2-2], m6
    pblendw          m4, m5, 0x55
    pandn            m5, m7, m3
    mova             m6, m7
    vpgatherdd       m2, [scalingq+m5-0], m7
    psrld            m3, 16
    vpgatherdd       m5, [scalingq+m3-2], m6
    pblendw          m5, m2, 0x55

    ; grain = grain_lut[offy+y][offx+x]
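    ; horizontal overlap: blend the first grain column(s) with the previous
    ; block (left_offxy) using the spec weights (27/17 and 17/27 for two
    ; columns at full resolution, 23/22 for one column when horizontally
    ; subsampled), then round2(x, 5) and clamp to grain_min/grain_max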
    movu             m2, [grain_lutq+offxyq*2]
%if %2
    movu             m3, [grain_lutq+offxyq*2+82*2]
%else
    movu             m3, [grain_lutq+offxyq*2+32]
%endif
    movd            xm6, [grain_lutq+left_offxyq*2]
%if %2
    pinsrw          xm6, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1}
    punpckldq       xm7, xm2, xm3           ; {cur0, cur1}
    punpcklwd       xm6, xm7                ; {left0, cur0, left1, cur1}
%else
    punpcklwd       xm6, xm2
%endif
%if %1
%if %2
    vpbroadcastq    xm7, [pw_23_22]
%else
    movq            xm7, [pw_27_17_17_27]
%endif
    pmaddwd         xm6, xm7
    vpbroadcastd    xm7, [pd_16]
    paddd           xm6, xm7
%else
    pmaddwd         xm6, xm15
    paddd           xm6, xm14
%endif
    psrad           xm6, 5
    packssdw        xm6, xm6
    pmaxsw          xm6, xm8
    pminsw          xm6, xm9
    vpblendd         m2, m6, 0x01
%if %2
    pshuflw         xm6, xm6, q1032
    vpblendd         m3, m6, 0x01
%endif

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw        m4, m11
    pmaddubsw        m5, m11
    paddw            m4, m4
    paddw            m5, m5
    pmulhrsw         m2, m4
    pmulhrsw         m3, m5

    ; dst = clip_pixel(src, noise)
%if %1
    paddw            m0, m2
    paddw            m1, m3
%else
    paddw            m0, m2, [srcq]
%if %2
    paddw            m1, m3, [srcq+strideq]
%else
    paddw            m1, m3, [srcq+32]
%endif
%endif
    pmaxsw           m0, m12
    pmaxsw           m1, m12
    pminsw           m0, m13
    pminsw           m1, m13
    mova         [dstq], m0
%if %2
    mova [dstq+strideq], m1
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    mova      [dstq+32], m1
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, r10mp
%endif
    add      grain_lutq, 82*(2<<%2)
%if %2
    sub              hb, 2
%else
    dec              hb
%endif
    jg %%loop_y_h_overlap
    add              wq, 32>>%2
    jge .end
    mov            srcq, r9mp
    mov            dstq, r11mp
    mov           lumaq, r12mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]
    cmp       dword r8m, 0 ; sby
    jne %%loop_x_hv_overlap
    jmp %%loop_x_h_overlap

%%vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
                sby, see, unused1, unused2, unused3, lstride

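    ; derive the row seeds for the current and the above superblock row in
    ; one register: seed ^= ((sby*37+178)&0xff)<<8 ^ ((sby*173+105)&0xff);
    ; the 141/188 constants are the same expressions at sby-1 (mod 256),
    ; giving (cur_seed << 16) | top_seed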
    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, unused1, top_offxy, unused2, luma, lstride

    mov           lumaq, r9mp
    mov        lstrideq, r10mp
    lea             r10, [srcq+wq*2]
    lea             r11, [dstq+wq*2]
    lea             r12, [lumaq+wq*(2<<%2)]
    mov            r9mp, r10
    mov           r11mp, r11
    mov           r12mp, r12
    neg              wq

%%loop_x_v_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
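    ; advance both 16-bit seeds (cur in the high half, top in the low half)
    ; at once: each half's feedback bit is computed with test/setp and placed
    ; at bits 16 and 0 of r7d, so the single rorx by 1 shifts both halves
    ; right and rotates the feedback bits into the new top bits (15 and 31)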
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, unused1, top_offxy, unused2, luma, lstride

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
%if %2 == 0
    lea             r10, [pw_27_17_17_27]
%endif
%%loop_y_v_overlap:
    ; luma_src
%if %2
    mova            xm2, [lumaq+lstrideq*0+ 0]
    vinserti128      m2, [lumaq+lstrideq*0+32], 1
    mova            xm4, [lumaq+lstrideq*0+16]
    vinserti128      m4, [lumaq+lstrideq*0+48], 1
    mova            xm3, [lumaq+lstrideq*(1<<%3)+ 0]
    vinserti128      m3, [lumaq+lstrideq*(1<<%3)+32], 1
    mova            xm5, [lumaq+lstrideq*(1<<%3)+16]
    vinserti128      m5, [lumaq+lstrideq*(1<<%3)+48], 1
    phaddw           m2, m4
    phaddw           m3, m5
    pxor             m4, m4
    pavgw            m2, m4
    pavgw            m3, m4
%elif %1
    mova             m2, [lumaq]
    mova             m3, [lumaq+32]
%endif
%if %1
    mova             m0, [srcq]
%if %2
    mova             m1, [srcq+strideq]
%else
    mova             m1, [srcq+32]
%endif
    punpckhwd        m4, m2, m0
    punpcklwd        m2, m0
    punpckhwd        m5, m3, m1
    punpcklwd        m3, m1                 ; { luma, chroma }
    REPX {pmaddwd x, m14}, m4, m2, m5, m3
    REPX {paddd   x, m15}, m4, m2, m5, m3
    REPX {psrad   x, 6  }, m4, m2, m5, m3
    packusdw         m2, m4
    packusdw         m3, m5
    pminuw           m2, m10                ; clip_pixel()
    pminuw           m3, m10
%elif %2
    pand             m2, m10
    pand             m3, m10
%else
    pand             m2, m10, [lumaq+ 0]
    pand             m3, m10, [lumaq+32]
%endif

    ; scaling[luma_src]
    vpbroadcastd     m7, [pd_m65536]
    pandn            m4, m7, m2
    mova             m6, m7
    vpgatherdd       m5, [scalingq+m4-0], m7
    psrld            m2, 16
    mova             m7, m6
    vpgatherdd       m4, [scalingq+m2-2], m6
    pblendw          m4, m5, 0x55
    pandn            m5, m7, m3
    mova             m6, m7
    vpgatherdd       m2, [scalingq+m5-0], m7
    psrld            m3, 16
    vpgatherdd       m5, [scalingq+m3-2], m6
    pblendw          m5, m2, 0x55

    ; grain = grain_lut[offy+y][offx+x]
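    ; vertical overlap: blend the first grain row(s) with the bottom of the
    ; block above (top_offxy); weights are 23/22 when vertically subsampled,
    ; otherwise 27/17 for the first overlapping row and 17/27 for the second,
    ; each result clamped to grain_min/grain_max (m8/m9)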
    movu             m6, [grain_lutq+offxyq*2]
    movu             m3, [grain_lutq+top_offxyq*2]
    punpcklwd        m2, m3, m6
    punpckhwd        m3, m6                 ; { top, cur }
%if %3
    vpbroadcastd     m0, [pw_23_22]
%elif %2
    vpbroadcastd     m0, [pw_27_17_17_27]
%else
    vpbroadcastd     m0, [r10]
%endif
    REPX {pmaddwd x, m0}, m2, m3
%if %1
    vpbroadcastd     m1, [pd_16]
    REPX  {paddd x, m1}, m2, m3
%else
    REPX {paddd x, m14}, m2, m3
%endif
    REPX   {psrad x, 5}, m2, m3
    packssdw         m2, m3
%if %2
    movu             m3, [grain_lutq+offxyq*2+82*2]
%else
    movu             m3, [grain_lutq+offxyq*2+32]
%endif
%if %3
    pmaxsw           m2, m8
    pminsw           m2, m9
%else
%if %2
    movu             m7, [grain_lutq+top_offxyq*2+82*2]
    punpckhwd        m6, m3, m7             ; { cur, top }
    punpcklwd        m3, m7
%else
    movu             m7, [grain_lutq+top_offxyq*2+32]
    punpckhwd        m6, m7, m3
    punpcklwd        m3, m7, m3             ; { top, cur }
%endif
    pmaddwd          m6, m0
    pmaddwd          m3, m0
%if %1
    paddd            m6, m1
    paddd            m3, m1
%else
    paddd            m6, m14
    paddd            m3, m14
%endif
    psrad            m6, 5
    psrad            m3, 5
    packssdw         m3, m6
    pmaxsw           m2, m8
    pmaxsw           m3, m8
    pminsw           m2, m9
    pminsw           m3, m9
%endif

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw        m4, m11
    pmaddubsw        m5, m11
    paddw            m4, m4
    paddw            m5, m5
    pmulhrsw         m2, m4
    pmulhrsw         m3, m5

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2, [srcq]
%if %2
    paddw            m1, m3, [srcq+strideq]
%else
    paddw            m1, m3, [srcq+32]
%endif
    pmaxsw           m0, m12
    pmaxsw           m1, m12
    pminsw           m0, m13
    pminsw           m1, m13
    mova         [dstq], m0
%if %2
    mova [dstq+strideq], m1
    sub              hb, 2
%else
    mova      [dstq+32], m1
    dec              hb
%endif
    jle %%end_y_v_overlap
%if %2
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82*(2<<%2)
%if %2
    jmp %%loop_y
%else
    add              hd, 0x80000000
    jc %%loop_y
    add             r10, 4
    jmp %%loop_y_v_overlap
%endif
%%end_y_v_overlap:
    add              wq, 32>>%2
    jge .end
    mov            srcq, r9mp
    mov            dstq, r11mp
    mov           lumaq, r12mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap
%%loop_x_hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride

%if %2 == 0
    lea             r14, [pw_27_17_17_27]
%endif
    lea  topleft_offxyq, [top_offxyq+(32>>%2)]
    lea     left_offxyq, [offyq+(32>>%2)]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
%%loop_y_hv_overlap:
    ; luma_src
%if %2
    mova            xm2, [lumaq+lstrideq*0+ 0]
    vinserti128      m2, [lumaq+lstrideq*0+32], 1
    mova            xm4, [lumaq+lstrideq*0+16]
    vinserti128      m4, [lumaq+lstrideq*0+48], 1
    mova            xm3, [lumaq+lstrideq*(1<<%3)+ 0]
    vinserti128      m3, [lumaq+lstrideq*(1<<%3)+32], 1
    mova            xm5, [lumaq+lstrideq*(1<<%3)+16]
    vinserti128      m5, [lumaq+lstrideq*(1<<%3)+48], 1
    phaddw           m2, m4
    phaddw           m3, m5
    pxor             m4, m4
    pavgw            m2, m4
    pavgw            m3, m4
%elif %1
    mova             m2, [lumaq]
    mova             m3, [lumaq+32]
%endif
%if %1
    mova             m0, [srcq]
%if %2
    mova             m1, [srcq+strideq]
%else
    mova             m1, [srcq+32]
%endif
    punpckhwd        m4, m2, m0
    punpcklwd        m2, m0
    punpckhwd        m5, m3, m1
    punpcklwd        m3, m1                 ; { luma, chroma }
    REPX {pmaddwd x, m14}, m4, m2, m5, m3
    REPX {paddd   x, m15}, m4, m2, m5, m3
    REPX {psrad   x, 6  }, m4, m2, m5, m3
    packusdw         m2, m4
    packusdw         m3, m5
    pminuw           m2, m10                ; clip_pixel()
    pminuw           m3, m10
%elif %2
    pand             m2, m10
    pand             m3, m10
%else
    pand             m2, m10, [lumaq+ 0]
    pand             m3, m10, [lumaq+32]
%endif

    ; scaling[luma_src]
    vpbroadcastd     m7, [pd_m65536]
    pandn            m4, m7, m2
    mova             m6, m7
    vpgatherdd       m5, [scalingq+m4-0], m7
    psrld            m2, 16
    mova             m7, m6
    vpgatherdd       m4, [scalingq+m2-2], m6
    pblendw          m4, m5, 0x55
    pandn            m5, m7, m3
    mova             m6, m7
    vpgatherdd       m2, [scalingq+m5-0], m7
    psrld            m3, 16
    vpgatherdd       m5, [scalingq+m3-2], m6
    pblendw          m5, m2, 0x55

    ; grain = grain_lut[offy+y][offx+x]
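    ; h+v overlap: first blend the leftmost grain column of both the current
    ; rows and the top row with their left neighbours (left_offxy and
    ; topleft_offxy), then blend the result vertically with the top row,
    ; clamping after each stage as in the C reference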
    movu             m0, [grain_lutq+offxyq*2]
    movd            xm2, [grain_lutq+left_offxyq*2]
    movu             m6, [grain_lutq+top_offxyq*2]
%if %2
    pinsrw          xm2, [grain_lutq+left_offxyq*2+82*2], 2
    movu             m3, [grain_lutq+offxyq*2+82*2]
    punpckldq       xm1, xm0, xm3           ; { cur0, cur1 }
%if %3
    vinserti128      m2, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left }
    vinserti128      m1, [grain_lutq+top_offxyq*2], 1     ; { cur0, cur1, top0 }
%else
    vinserti128      m2, [grain_lutq+topleft_offxyq*2+82*2], 1
    vpbroadcastd     m7, [grain_lutq+topleft_offxyq*2]
    vpblendd         m2, m7, 0x20
    movd            xm7, [grain_lutq+top_offxyq*2+82*2]
    punpckldq       xm7, xm6
    vinserti128      m1, xm7, 1
    movu             m7, [grain_lutq+top_offxyq*2+82*2]
%endif
    punpcklwd        m2, m1                 ; { cur, left }
%if %1
    vpbroadcastq     m1, [pw_23_22]
    pmaddwd          m2, m1
    vpbroadcastd     m1, [pd_16]
    paddd            m2, m1
    psrad            m2, 5
    packssdw         m2, m2
    vpermq           m2, m2, q3120
%else
    pmaddwd          m2, m15
    paddd            m2, m14
    psrad            m2, 5
    vextracti128    xm1, m2, 1
    packssdw        xm2, xm1
%endif
%else
    pinsrd          xm2, [grain_lutq+topleft_offxyq*2], 1
    movu             m3, [grain_lutq+offxyq*2+32]
    movu             m7, [grain_lutq+top_offxyq*2+32]
    punpckldq       xm1, xm0, xm6
    punpcklwd       xm2, xm1                ; { cur, left }
%if %1
    movddup         xm1, [pw_27_17_17_27]
    pmaddwd         xm2, xm1
    vpbroadcastd     m1, [pd_16]
    paddd           xm2, xm1
%else
    pmaddwd         xm2, xm15
    paddd           xm2, xm14
%endif
    psrad           xm2, 5
    packssdw        xm2, xm2
%endif
    pmaxsw          xm2, xm8
    pminsw          xm2, xm9
    vpblendd         m0, m2, 0x01
%if %2
    pshufd          xm2, xm2, q0321
    vpblendd         m3, m2, 0x01
%if %3 == 0
    pshufd          xm2, xm2, q0321
    vpblendd         m7, m2, 0x01
%endif
%endif
    pshuflw         xm2, xm2, q1032
    vpblendd         m2, m6, 0xfe
    punpckhwd        m6, m0                 ; { top, cur }
    punpcklwd        m2, m0
%if %3
    vpbroadcastd     m0, [pw_23_22]
%elif %2
    vpbroadcastd     m0, [pw_27_17_17_27]
%else
    vpbroadcastd     m0, [r14]
%endif
    pmaddwd          m6, m0
    pmaddwd          m2, m0
%if %1
    paddd            m6, m1
    paddd            m2, m1
%else
    paddd            m6, m14
    paddd            m2, m14
%endif
    psrad            m6, 5
    psrad            m2, 5
    packssdw         m2, m6

%if %3
    pmaxsw           m2, m8
    pminsw           m2, m9
%else
%if %2
    punpckhwd        m6, m3, m7
    punpcklwd        m3, m7                 ; { cur, top }
%else
    punpckhwd        m6, m7, m3
    punpcklwd        m3, m7, m3             ; { top, cur }
%endif
    REPX {pmaddwd x, m0}, m6, m3
%if %1
    REPX  {paddd x, m1}, m6, m3
%else
    REPX {paddd x, m14}, m6, m3
%endif
    REPX   {psrad x, 5}, m6, m3
    packssdw         m3, m6
    pmaxsw           m2, m8
    pmaxsw           m3, m8
    pminsw           m2, m9
    pminsw           m3, m9
%endif

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw        m4, m11
    pmaddubsw        m5, m11
    paddw            m4, m4
    paddw            m5, m5
    pmulhrsw         m2, m4
    pmulhrsw         m3, m5

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2, [srcq]
%if %2
    paddw            m1, m3, [srcq+strideq]
%else
    paddw            m1, m3, [srcq+32]
%endif
    pmaxsw           m0, m12
    pmaxsw           m1, m12
    pminsw           m0, m13
    pminsw           m1, m13
    mova         [dstq], m0
%if %2
    mova [dstq+strideq], m1
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    mova      [dstq+32], m1
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, r10mp
%endif
    add      grain_lutq, 82*(2<<%2)
%if %2
    sub              hb, 2
    jg %%loop_y_h_overlap
%else
    dec              hb
    jle %%end_y_hv_overlap
    add              hd, 0x80000000
    jc %%loop_y_h_overlap
    add             r14, 4
    jmp %%loop_y_hv_overlap
%endif
%%end_y_hv_overlap:
    add              wq, 32>>%2
    jge .end
    mov            srcq, r9mp
    mov            dstq, r11mp
    mov           lumaq, r12mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]
    jmp %%loop_x_hv_overlap
%endmacro

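    ; expand the loop for both chroma scaling modes: %1 == 1 combines the
    ; (downsampled) luma with chroma through uv_luma_mult/uv_mult/uv_offset
    ; before indexing scaling[], while %1 == 0 (the .csfl entry point,
    ; chroma_scaling_from_luma) indexes scaling[] with the luma directly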
    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
    RET
%endmacro

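; instantiate the chroma grain generation and application functions for the
; three layouts; the trailing arguments are ss_hor, ss_ver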
GEN_GRAIN_UV_FN 420, 1, 1
FGUV_FN 420,         1, 1
GEN_GRAIN_UV_FN 422, 1, 0
FGUV_FN 422,         1, 0
GEN_GRAIN_UV_FN 444, 0, 0
FGUV_FN 444,         0, 0

%endif ; ARCH_X86_64
