xref: /aosp_15_r20/external/libdav1d/src/x86/filmgrain16_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1; Copyright © 2022, VideoLAN and dav1d authors
2; Copyright © 2022, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28%include "x86/filmgrain_common.asm"
29
30%if ARCH_X86_64
31
; Read-only constant tables. The functions below address them relative to
; fg_min through their `base` defines (base = <register> - fg_min).
32SECTION_RODATA 16
; vpshufb control: in every dword, byte 2 takes source byte 0 and the other
; bytes are zeroed (-1). It is applied under a 0xeee... byte write-mask, so
; byte 0 of the destination dword is kept and byte 2 receives the low byte
; of a second gather result.
33scale_mask:    db -1, -1,  0, -1, -1, -1,  4, -1, -1, -1,  8, -1, -1, -1, 12, -1
; Shift counts. Read at [scale_shift+scaling_shift*4-32] as the vpsllvw count
; for gathered scaling values (scaling_shift 8..11 selects 7..4; presumably
; that is the valid range - confirm), and at [scale_shift+is_12bpc*8+4] as
; the vpsravw count for blended grain (6 when is_12bpc=0, 4 when is_12bpc=1).
34scale_shift:           dw   7,   7,   6,   6,   5,   5,   4,   4
; Overlap blend weights, one group of four words per is_12bpc value
; (indexed with is_12bpc*8). The first group is the second group times 4.
35pw_27_17_17_27:        dw 108,  68,  68, 108,  27,  17,  17,  27
; Same layout; used for the horizontal overlap when chroma is horizontally
; subsampled (ss_hor branch of FGUV_FN).
36pw_23_22:              dw  92,  88,   0, 128,  23,  22,   0,  32
; Lower clamp bound, indexed by clip_to_restricted_range << is_12bpc
; (assumes the clip flag is 0 or 1 - confirm).
37fg_min:        times 2 dw 0
38               times 2 dw 64
39               times 2 dw 256
; Upper clamp bound. Luma index: is_12bpc + clip*4.
; Chroma index: is_12bpc + (clip << is_id)*2.
40fg_max:        times 2 dw 1023
41               times 2 dw 4095
42               times 2 dw 960
43               times 2 dw 3840
44               times 2 dw 940
45               times 2 dw 3760
; Accumulator seed (rounding term) for the vpdpwssd overlap blends,
; indexed by is_12bpc.
46scale_rnd:             dd 64
47                       dd 16
; Multiplier applied to FGData.uv_offset (via pmaddwd), indexed by is_12bpc.
48uv_offset_mul:         dd 256
49                       dd 1024
; pshufb control applied to the 16 bytes loaded at FGData.uv_mult[uv_pl]:
; each dword becomes { bytes 8-9, bytes 0-1 } of that load.
50pb_8_9_0_1:            db 8, 9, 0, 1
51
; Defined elsewhere; loaded as the "pw_even" permute indices in FGUV_FN
; (presumably the bytes 0..63 - confirm against its definition).
52cextern pb_0to63
53
54SECTION .text
55
56INIT_ZMM avx512icl
;-----------------------------------------------------------------------
; fgy_32x32xn_16bpc
; Adds film grain to 16-bit luma pixels. Per pixel (see .add_noise):
;   noise = pmulhrsw(grain, scaling[src] << m10)
;   dst   = min(max(src + noise, m8), m9)
; Work is done in columns of 32 pixels, two rows per .add_noise call.
;
; Arguments read from their argument slots: r9m (bdmax), sbym, hm,
; grain_lutmp. The first six arguments are in registers (cglobal "6").
;
; Registers that stay constant after the setup block:
;   m6      bdmax broadcast per dword (also used as the low-word AND mask)
;   m7      scale_mask shuffle control
;   m8, m9  clamp minimum / maximum
;   m10     left-shift count for gathered scaling values
;   m11     right-shift count for overlap-blended grain
;   m12,m13 vertical-overlap weights for the first / second row
;   m19     rounding constant used to seed the vpdpwssd accumulators
;   xm20    horizontal-overlap weights
;   k1 = 0xffff, k2 = 0x0f, k3 = 0xeeee... (bytes 1-3 of every dword)
;
; Paths: .loop_x (no overlap), .loop_x_h_overlap (blend with the previous
; column), .v_overlap (blend with the row above, first column),
; .hv_overlap (both). Vertical blending only covers the first two rows of
; a column; the remaining rows continue in .loop_y / .loop_y_h_overlap.
;
; r6 and r7 are used as scratch during setup even though DEFINE_ARGS names
; them grain_lut and offx/h; grain_lutq and hd are reloaded before each
; column.
;-----------------------------------------------------------------------
57cglobal fgy_32x32xn_16bpc, 6, 15, 21, dst, src, stride, fg_data, w, scaling, \
58                                      grain_lut, offx, sby, see, offy, src_bak
59%define base r11-fg_min
60    lea             r11, [fg_min]
61    mov             r6d, r9m    ; bdmax
62    mov             r9d, [fg_dataq+FGData.clip_to_restricted_range]
63    mov             r7d, [fg_dataq+FGData.scaling_shift]
64    mov            sbyd, sbym
65    vpbroadcastd     m6, r9m
66    shr             r6d, 11     ; is_12bpc
67    vbroadcasti32x4  m7, [base+scale_mask]
68    shlx           r10d, r9d, r6d ; fg_min index = clip << is_12bpc
69    vpbroadcastd    m10, [base+scale_shift+r7*4-32]
70    lea             r9d, [r6+r9*4] ; fg_max index = is_12bpc + clip*4
71    vpbroadcastd     m8, [base+fg_min+r10*4]
72    kxnorw           k1, k1, k1 ; 0xffff
73    vpbroadcastd     m9, [base+fg_max+r9*4]
74    mov             r12, 0xeeeeeeeeeeeeeeee
75    vpbroadcastd    m19, [base+scale_rnd+r6*4]
76    kshiftrb         k2, k1, 4  ; 0xf
77    vpbroadcastq   xm20, [base+pw_27_17_17_27+r6*8]
78    kmovq            k3, r12
79    vpbroadcastd    m11, [base+scale_shift+r6*8+4]
80    test           sbyd, sbyd
81    setnz           r7b ; r7b = (sby != 0)
82    vpbroadcastd    m12, [base+pw_27_17_17_27+r6*8+0]
83    vpbroadcastd    m13, [base+pw_27_17_17_27+r6*8+4]
    ; vertical overlap is taken only when sby != 0 and the overlap flag
    ; shares a set bit with r7b
84    test            r7b, [fg_dataq+FGData.overlap_flag]
85    jnz .v_overlap
86
    ; derive the 16-bit row seed from sby and FGData.seed
87    imul           seed, sbyd, (173 << 24) | 37
88    add            seed, (105 << 24) | 178
89    rorx           seed, seed, 24
90    movzx          seed, seew
91    xor            seed, [fg_dataq+FGData.seed]
    ; src_bak = end of the row; w counts up from -w towards 0 and dst is
    ; kept as an offset from src. src_bak lives in r11, so `base` must not
    ; be used past this point.
92    lea        src_bakq, [srcq+wq*2]
93    neg              wq
94    sub            dstq, srcq
95
96.loop_x:
    ; LFSR step: seed = (seed >> 1) | (feedback << 15), where feedback is
    ; the xor of seed bits 0, 1, 3 and 12. The OR forces every other bit
    ; to 1 so the parity flag of (low byte & high byte) yields that xor.
97    rorx             r6, seeq, 1
98    or             seed, 0xeff4
99    test           seeb, seeh
100    lea            seed, [r6+0x8000]
101    cmovp          seed, r6d                 ; updated seed
    ; offy = bits 8-11 of the seed, offx = bits 12-15
102    rorx          offyd, seed, 8
103    rorx          offxq, seeq, 12
104    and           offyd, 0xf
105    imul          offyd, 164
    ; 164 = 2*82 and 747 = 9*82+9; the index is in grain_lut entries
106    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx
107
108    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
109                sby, see, offxy, src_bak
110
111    mov      grain_lutq, grain_lutmp
112    mov              hd, hm
113.loop_y:
    ; two grain rows (row pitch is 82*2 bytes); .add_noise advances
    ; grain_lutq and srcq by two rows
114    movu             m4, [grain_lutq+offxyq*2+82*0]
115    movu             m5, [grain_lutq+offxyq*2+82*2]
116    call .add_noise
117    sub              hb, 2
118    jg .loop_y
119    add              wq, 32
120    jge .end
121    lea            srcq, [src_bakq+wq*2]
122    cmp byte [fg_dataq+FGData.overlap_flag], 0
123    je .loop_x
    ; NOTE(review): when this tail is reached via .v_overlap, sbyd no
    ; longer holds sby but the masked seed term computed there. That value
    ; appears to be always nonzero (its two kept bytes differ by 37 mod
    ; 256), so the branch below is still taken - confirm.
124    test           sbyd, sbyd
125    jnz .hv_overlap
126
127    ; horizontal overlap (without vertical overlap)
128.loop_x_h_overlap:
    ; same LFSR step as in .loop_x
129    rorx             r6, seeq, 1
130    or             seed, 0xeff4
131    test           seeb, seeh
132    lea            seed, [r6+0x8000]
133    cmovp          seed, r6d                 ; updated seed
134
135    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
136                sby, see, offy, src_bak, left_offxy
137
    ; offyq still holds the previous column's offxy here. The loads below
    ; use left_offxy*2-82 and left_offxy*2+82, and 73*2-82 = 64 bytes, i.e.
    ; 32 entries past the previous column's start on the same row.
138    lea     left_offxyd, [offyq+73]          ; previous column's offy*stride+offx
139    rorx          offyd, seed, 8
140    rorx          offxq, seeq, 12
141    and           offyd, 0xf
142    imul          offyd, 164
143    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx
144
145    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
146                sby, see, offxy, src_bak, left_offxy
147
148    mov      grain_lutq, grain_lutmp
149    mov              hd, hm
150.loop_y_h_overlap:
151    movu             m4, [grain_lutq+offxyq*2+82*0]
152    movu             m5, [grain_lutq+offxyq*2+82*2]
    ; two entries from the previous column's block for each of the two rows
153    movd           xm17, [grain_lutq+left_offxyq*2-82*1]
154    pinsrd         xm17, [grain_lutq+left_offxyq*2+82*1], 1
    ; interleave (left, cur) pairs and blend them with the xm20 weights
155    punpckldq      xm16, xm4, xm5
156    punpcklwd      xm17, xm16
157    mova           xm16, xm19
158    vpdpwssd       xm16, xm20, xm17
159    psrad          xm16, 1
160    packssdw       xm16, xm16
161    vpsravw        xm16, xm11
    ; k2 = 0x0f: replace only the first 4 bytes (two entries) of each row
162    vmovdqu8     m4{k2}, m16
163    vpalignr     m5{k2}, m16, m16, 4
164    call .add_noise
165    sub              hb, 2
166    jg .loop_y_h_overlap
167    add              wq, 32
168    jge .end
169    lea            srcq, [src_bakq+wq*2]
170    test           sbyd, sbyd
171    jnz .hv_overlap
172    jmp .loop_x_h_overlap
173
174.v_overlap:
    ; build the seeds for the current and the previous (top) block row at
    ; once, packed into one 32-bit register
175    movzx          sbyd, sbyb
176    imul           seed, [fg_dataq+FGData.seed], 0x00010001
177    imul            r7d, sbyd, 173 * 0x00010001
178    imul           sbyd, 37 * 0x01000100
179    add             r7d, (105 << 16) | 188
180    add            sbyd, (178 << 24) | (141 << 8)
181    and             r7d, 0x00ff00ff
182    and            sbyd, 0xff00ff00
183    xor            seed, r7d
184    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
185    lea        src_bakq, [srcq+wq*2]
186    neg              wq
187    sub            dstq, srcq
188
    ; LFSR step applied to both 16-bit halves: the two parities are
    ; collected in bits 0 and 16 of r7d, xor'ed into the forced-1 low bit
    ; of each half, and the 32-bit rotate moves them to bits 31 and 15.
189    ; we assume from the block above that bits 8-15 of r7d are zero'ed
190    mov             r6d, seed
191    or             seed, 0xeff4eff4
192    test           seeb, seeh
193    setp            r7b                     ; parity of top_seed
194    shr            seed, 16
195    shl             r7d, 16
196    test           seeb, seeh
197    setp            r7b                     ; parity of cur_seed
198    or              r6d, 0x00010001
199    xor             r7d, r6d
200    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
201
202    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
203                sby, see, offy, src_bak, _, top_offxy
204
205    rorx          offyd, seed, 8
206    rorx          offxd, seed, 12
207    and           offyd, 0xf000f
208    and           offxd, 0xf000f
209    imul          offyd, 164
210    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    ; the extra 32*82 lands in the low (top) half only: 32 rows further down
211    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]
212
213    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
214                sby, see, offxy, src_bak, _, top_offxy
215
216    mov      grain_lutq, grain_lutmp
217    mov              hd, hm
    ; split the packed offsets into two registers
218    movzx    top_offxyd, offxyw
219    shr          offxyd, 16
220
221    movu            m16, [grain_lutq+offxyq*2+82*0]
222    movu             m0, [grain_lutq+top_offxyq*2+82*0]
223    movu            m17, [grain_lutq+offxyq*2+82*2]
224    movu             m1, [grain_lutq+top_offxyq*2+82*2]
    ; interleave (top, cur) word pairs for the vpdpwssd in .add_noise_v
225    punpckhwd        m4, m0, m16
226    punpcklwd        m0, m16
227    punpckhwd        m5, m1, m17
228    punpcklwd        m1, m17
229    call .add_noise_v
    ; only the first two rows are blended vertically; the rest of the
    ; column is processed by the plain loop
230    sub              hb, 2
231    jg .loop_y
232    add              wq, 32
233    jge .end
234    lea            srcq, [src_bakq+wq*2]
235
236    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
237    ; to .v_overlap, and instead always fall-through to .hv_overlap
238.hv_overlap:
    ; same paired LFSR step as in .v_overlap; r7 holds h at this point
239    ; we assume from the block above that bits 8-15 of r7d are zero'ed
240    mov             r6d, seed
241    or             seed, 0xeff4eff4
242    test           seeb, seeh
243    setp            r7b                     ; parity of top_seed
244    shr            seed, 16
245    shl             r7d, 16
246    test           seeb, seeh
247    setp            r7b                     ; parity of cur_seed
248    or              r6d, 0x00010001
249    xor             r7d, r6d
250    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
251
252    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
253                sby, see, offy, src_bak, left_offxy, top_offxy, topleft_offxy
254
    ; derived from the previous column's offsets, before they are replaced
255    lea  topleft_offxyd, [top_offxyq+73]
256    lea     left_offxyd, [offyq+73]
257    rorx          offyd, seed, 8
258    rorx          offxd, seed, 12
259    and           offyd, 0xf000f
260    and           offxd, 0xf000f
261    imul          offyd, 164
262    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
263    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]
264
265    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
266                sby, see, offxy, src_bak, left_offxy, top_offxy, topleft_offxy
267
268    mov      grain_lutq, grain_lutmp
269    mov              hd, hm
270    movzx    top_offxyd, offxyw
271    shr          offxyd, 16
272
    ; m5/m2 = current rows 0/1, m0/m1 = top rows 0/1,
    ; xm17/xm18 = { left, topleft } pairs for rows 0/1
273    movu             m5, [grain_lutq+offxyq*2+82*0]
274    movu             m0, [grain_lutq+top_offxyq*2+82*0]
275    movd           xm17, [grain_lutq+left_offxyq*2-82*1]
276    pinsrd         xm17, [grain_lutq+topleft_offxyq*2-82*1], 1
277    movu             m2, [grain_lutq+offxyq*2+82*2]
278    movu             m1, [grain_lutq+top_offxyq*2+82*2]
279    movd           xm18, [grain_lutq+left_offxyq*2+82*1]
280    pinsrd         xm18, [grain_lutq+topleft_offxyq*2+82*1], 1
    ; horizontal blend of the first two entries of the current and top rows
281    punpckldq      xm16, xm5, xm0
282    punpcklwd      xm17, xm16
283    mova           xm16, xm19
284    vpdpwssd       xm16, xm20, xm17
285    punpckldq      xm17, xm2, xm1
286    punpcklwd      xm18, xm17
287    mova           xm17, xm19
288    vpdpwssd       xm17, xm20, xm18
    ; (top, cur) interleave for the vertical blend
289    punpckhwd        m4, m0, m5
290    punpcklwd        m0, m5
291    punpckhwd        m5, m1, m2
292    punpcklwd        m1, m2
293    psrad          xm16, 1
294    psrad          xm17, 1
295    packssdw       xm16, xm17
296    vpsravw        xm16, xm11
    ; reorder the blended { cur0, cur1, top0, top1 } words into
    ; (top, cur) pairs and patch them into the first four words (k2)
297    vpshuflw     m0{k2}, m16, q1302
298    punpckhqdq     xm16, xm16
299    vpshuflw     m1{k2}, m16, q1302
300    call .add_noise_v
    ; remaining rows of this column only need the horizontal blend
301    sub              hb, 2
302    jg .loop_y_h_overlap
303    add              wq, 32
304    lea            srcq, [src_bakq+wq*2]
    ; lea leaves the flags of the add untouched
305    jl .hv_overlap
306.end:
307    RET
308ALIGN function_align
; .add_noise_v: vertical overlap blend, then falls through to .add_noise.
; In:  m0/m4 = low/high (top, cur) word pairs of row 0,
;      m1/m5 = the same for row 1
; Out: m4, m5 = blended grain rows 0 and 1
309.add_noise_v:
310    mova             m2, m19
311    vpdpwssd         m2, m12, m4
312    mova             m3, m19
313    vpdpwssd         m3, m13, m5
314    mova             m4, m19
315    vpdpwssd         m4, m12, m0
316    mova             m5, m19
317    vpdpwssd         m5, m13, m1
318    REPX   {psrad x, 1}, m2, m3, m4, m5
319    packssdw         m4, m2
320    packssdw         m5, m3
321    vpsravw          m4, m11
322    vpsravw          m5, m11
; .add_noise: apply two rows of grain (m4, m5) to two rows of pixels.
; Reads srcq, writes [dstq+srcq]; advances srcq by two rows and grain_lutq
; by 82*4 bytes. Uses m0-m3, m16, m17 and k4 as scratch.
323.add_noise:
324    mova             m0, [srcq+strideq*0]
325    mova             m1, [srcq+strideq*1]
    ; gathers clear their mask register, so k4 is reloaded before each one
326    kmovw            k4, k1
327    pand            m16, m6, m0 ; even pixels of row 0 as dword indices
328    psrld            m3, m0, 16 ; odd pixels of row 0
329    vpgatherdd   m2{k4}, [scalingq+m16]
    ; only gather odd pixels whose value does not exceed bdmax
330    vpcmpud          k4, m3, m6, 2 ; px <= bdmax
331    vpgatherdd  m16{k4}, [scalingq+m3]
332    kmovw            k4, k1
333    pand            m17, m6, m1
334    vpgatherdd   m3{k4}, [scalingq+m17]
    ; merge: low byte of the odd-pixel result goes to byte 2 of each dword
335    vpshufb      m2{k3}, m16, m7
336    psrld           m16, m1, 16
337    vpcmpud          k4, m16, m6, 2
338    vpgatherdd  m17{k4}, [scalingq+m16]
339    vpshufb      m3{k3}, m17, m7
    ; scaling << shift, then rounded high multiply with the grain
340    vpsllvw          m2, m10
341    vpsllvw          m3, m10
342    pmulhrsw         m4, m2
343    pmulhrsw         m5, m3
344    add      grain_lutq, 82*4
345    paddw            m0, m4
346    paddw            m1, m5
    ; clamp to [m8, m9]
347    pmaxsw           m0, m8
348    pmaxsw           m1, m8
349    pminsw           m0, m9
350    pminsw           m1, m9
    ; dstq holds dst-src, so dstq+srcq addresses the destination
351    mova    [dstq+srcq], m0
352    add            srcq, strideq
353    mova    [dstq+srcq], m1
354    add            srcq, strideq
355    ret
356
;-----------------------------------------------------------------------
; FGUV_FN name, ss_hor, ss_ver
; Emits fguv_32x32xn_i<name>_16bpc: film grain for a 16-bit chroma plane.
; ss_hor / ss_ver select horizontal / vertical chroma subsampling.
; Per pixel (see the add_noise helper at the end of the inner macro):
;   val   = luma (csfl), or
;           min((luma*uv_luma_mult + src*uv_mult + m14) >> 6, bdmax)
;   noise = pmulhrsw(grain, scaling[val] << m7)
;   dst   = min(max(src + noise, m8), m9)
;
; Arguments read from their argument slots: r13m (bdmax), is_idm, sbym,
; r10mp (lstride), r11m (uv_pl), r9mp (luma), hm, grain_lutmp.
;
; Registers that stay constant after the setup block:
;   m4      bdmax per word (packssdw of m5)
;   m5      bdmax per dword
;   m6      scale_mask shuffle control
;   m7      left-shift count for gathered scaling values
;   m8, m9  clamp minimum / maximum
;   m10     horizontal-overlap weights
;   m11     vertical-overlap weights
;   m12,m13 word permute indices "pw_even" / "pw_odd" (ss_hor only)
;   m14,m15 scaled uv_offset / { uv_luma_mult, uv_mult } (not-csfl only)
;   m20     rounding constant seeding the vpdpwssd accumulators
;   m21     right-shift count for overlap-blended grain
;   k1 = 0xffff, k2 = 0xeeee..., k3 = 0x0101 (ss_hor) or 0x01,
;   k4 = 0x0f (no ss_hor only), k5 = scratch gather mask
;
; The inner loop macro is expanded twice: once for the not-csfl case and
; once under .csfl. The csfl expansion jumps into the shared
; .add_noise_main tail emitted by the first expansion.
;-----------------------------------------------------------------------
357%macro FGUV_FN 3 ; name, ss_hor, ss_ver
358cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling, \
359                                           grain_lut, h, sby, luma, lstride, uv_pl, is_id
360%define base r12-fg_min
361    lea             r12, [fg_min]
362    mov             r9d, r13m            ; bdmax
363    mov             r7d, [fg_dataq+FGData.scaling_shift]
364    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
365    mov            r11d, is_idm
366    kxnorw           k1, k1, k1          ; 0xffff
367    vpbroadcastd     m5, r13m
368    mov             r13, 0xeeeeeeeeeeeeeeee
369    vbroadcasti32x4  m6, [base+scale_mask]
370    shr             r9d, 11              ; is_12bpc
371    vpbroadcastd     m7, [base+scale_shift+r7*4-32]
372    shlx           r10d, r6d, r9d ; fg_min index = clip << is_12bpc
373    mov            sbyd, sbym
374    shlx            r6d, r6d, r11d ; clip << is_id
375    vpbroadcastd     m8, [base+fg_min+r10*4]
376    lea             r6d, [r9+r6*2] ; fg_max index
377    vpbroadcastd     m9, [base+fg_max+r6*4]
378    kmovq            k2, r13
379    vpbroadcastd    m20, [base+scale_rnd+r9*4]
380    packssdw         m4, m5, m5 ; bdmax per word
381    vpbroadcastd    m21, [base+scale_shift+r9*8+4]
382%if %2
383    mova            m12, [pb_0to63] ; pw_even
384    mov            r13d, 0x0101
385    vpbroadcastq    m10, [base+pw_23_22+r9*8]
386    kmovw            k3, r13d
387%if %3
388    pshufd          m11, m10, q0000
389%else
    ; first-row weights in the low 256 bits (k1 covers 16 words),
    ; second-row weights in the high 256 bits
390    vpbroadcastd   ym16, [base+pw_27_17_17_27+r9*8+0]
391    vpbroadcastd    m11, [base+pw_27_17_17_27+r9*8+4]
392    vmovdqu16   m11{k1}, m16
393%endif
394    psrlw           m13, m12, 8          ; pw_odd
395%else
396    vpbroadcastq    m10, [base+pw_27_17_17_27+r9*8]
397    kshiftrb         k3, k1, 7           ; 0x01
398    kshiftrb         k4, k1, 4           ; 0x0f
399    pshufd          m11, m10, q0000
400%endif
401    mov        lstrideq, r10mp
402    test           sbyd, sbyd
403    setnz           r7b ; r7b = (sby != 0)
404    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
405    jne .csfl
406
407%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
408    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
409                _, sby, see, lstride
410
411%if %1
412    mov             r6d, r11m ; uv_pl
413    vpbroadcastd     m0, [base+uv_offset_mul+r9*4]
414    vpbroadcastd     m1, [base+pb_8_9_0_1]
415    vpbroadcastd    m14, [fg_dataq+FGData.uv_offset+r6*4]
    ; 16-byte load starting at uv_mult[uv_pl]; bytes 8-9 of it are taken
    ; as uv_luma_mult (presumably the field that follows uv_mult in
    ; FGData - confirm against the struct definition)
416    vbroadcasti32x4 m15, [fg_dataq+FGData.uv_mult+r6*4]
    ; m14 = low word of uv_offset times 256 or 1024
417    pmaddwd         m14, m0
418    pshufb          m15, m1 ; { uv_luma_mult, uv_mult }
419%endif
420    test            r7b, [fg_dataq+FGData.overlap_flag]
421    jnz %%v_overlap
422
    ; derive the 16-bit row seed from sby and FGData.seed
423    imul           seed, sbyd, (173 << 24) | 37
424    add            seed, (105 << 24) | 178
425    rorx           seed, seed, 24
426    movzx          seed, seew
427    xor            seed, [fg_dataq+FGData.seed]
428
429    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
430                offx, offy, see, lstride, luma
431
    ; The argument slots r9m/r10m/r11m have been consumed by now and are
    ; reused to keep the end-of-row src, dst and luma pointers. w counts
    ; up from -w towards 0. r12 is overwritten here, so `base` must not be
    ; used past this point.
432    mov           lumaq, r9mp
433    lea             r12, [srcq+wq*2]
434    lea             r13, [dstq+wq*2]
435    lea             r14, [lumaq+wq*(2<<%2)]
436    mov            r9mp, r12
437    mov           r10mp, r13
438    mov           r11mp, r14
439    neg              wq
440
441%%loop_x:
    ; LFSR step: seed = (seed >> 1) | (feedback << 15), feedback being the
    ; xor of seed bits 0, 1, 3 and 12 (read from the parity flag)
442    rorx             r6, seeq, 1
443    or             seed, 0xeff4
444    test           seeb, seeh
445    lea            seed, [r6+0x8000]
446    cmovp          seed, r6d               ; updated seed
    ; offy = bits 8-11 of the seed, offx = bits 12-15
447    rorx          offyd, seed, 8
448    rorx          offxq, seeq, 12
449    and           offyd, 0xf
450    imul          offyd, 164>>%3
451    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
452
453    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
454                h, offxy, see, lstride, luma
455
456    mov      grain_lutq, grain_lutmp
457    mov              hd, hm
458%%loop_y:
459%if %2
    ; four grain rows of 16 entries: rows 0,1 in m18 and rows 2,3 in m19
460    movu           ym18, [grain_lutq+offxyq*2+82*0]
461    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
462    movu           ym19, [grain_lutq+offxyq*2+82*4]
463    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
464%else
    ; two grain rows of 32 entries
465    movu            m18, [grain_lutq+offxyq*2+82*0]
466    movu            m19, [grain_lutq+offxyq*2+82*2]
467%endif
468    call %%add_noise
469    sub              hb, 2<<%2
470    jg %%loop_y
471    add              wq, 32>>%2
472    jge .end
    ; rewind to the top of the next column using the saved row-end pointers
473    mov            srcq, r9mp
474    mov            dstq, r10mp
475    mov           lumaq, r11mp
476    lea            srcq, [srcq+wq*2]
477    lea            dstq, [dstq+wq*2]
478    lea           lumaq, [lumaq+wq*(2<<%2)]
479    cmp byte [fg_dataq+FGData.overlap_flag], 0
480    je %%loop_x
    ; r8 is reused for offy/offxy here, so sby is re-read from its slot
481    cmp       dword r8m, 0 ; sby
482    jne %%hv_overlap
483
484    ; horizontal overlap (without vertical overlap)
485%%loop_x_h_overlap:
486    rorx             r6, seeq, 1
487    or             seed, 0xEFF4
488    test           seeb, seeh
489    lea            seed, [r6+0x8000]
490    cmovp          seed, r6d               ; updated seed
491
492    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
493                offx, offy, see, lstride, luma, left_offxy
494
    ; offyq still holds the previous column's offxy at this point
495    lea     left_offxyd, [offyq+(32>>%2)]  ; previous column's offy*stride+offx
496    rorx          offyd, seed, 8
497    rorx          offxq, seeq, 12
498    and           offyd, 0xf
499    imul          offyd, 164>>%3
500    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
501
502    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
503                h, offxy, see, lstride, luma, left_offxy
504
505    mov      grain_lutq, grain_lutmp
506    mov              hd, hm
507%%loop_y_h_overlap:
508%if %2
509    movu           ym18, [grain_lutq+offxyq*2+82*0]
510    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
511    movu           ym19, [grain_lutq+offxyq*2+82*4]
512    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    ; entries from the previous column's block for each of the four rows
513    movd           xm16, [grain_lutq+left_offxyq*2+82*0]
514    vinserti32x4    m16, [grain_lutq+left_offxyq*2+82*2], 2
515    movd           xm17, [grain_lutq+left_offxyq*2+82*4]
516    vinserti32x4    m17, [grain_lutq+left_offxyq*2+82*6], 2
    ; interleave (left, cur) pairs and blend them with the m10 weights
517    punpckldq       m16, m17
518    punpckldq       m17, m18, m19
519    punpcklwd       m16, m17
520    mova            m17, m20
521    vpdpwssd        m17, m16, m10
522    psrad           m17, 1
523    packssdw        m17, m17
524    vpsravw         m17, m21
525%else
526    movu            m18, [grain_lutq+offxyq*2+82*0]
527    movu            m19, [grain_lutq+offxyq*2+82*2]
528    movd           xm16, [grain_lutq+left_offxyq*2+82*0]
529    pinsrd         xm16, [grain_lutq+left_offxyq*2+82*2], 1
530    punpckldq      xm17, xm18, xm19
531    punpcklwd      xm16, xm17
532    mova           xm17, xm20
533    vpdpwssd       xm17, xm16, xm10
534    psrad          xm17, 1
535    packssdw       xm17, xm17
536    vpsravw        xm17, xm21
537%endif
    ; k3 selects the first dword of each row held in the register
    ; (dwords 0 and 8 with ss_hor, dword 0 otherwise)
538    vmovdqa32   m18{k3}, m17
539    vpshufd     m19{k3}, m17, q0321
540    call %%add_noise
541    sub              hb, 2<<%2
542    jg %%loop_y_h_overlap
543    add              wq, 32>>%2
544    jge .end
545    mov            srcq, r9mp
546    mov            dstq, r10mp
547    mov           lumaq, r11mp
548    lea            srcq, [srcq+wq*2]
549    lea            dstq, [dstq+wq*2]
550    lea           lumaq, [lumaq+wq*(2<<%2)]
551    cmp       dword r8m, 0 ; sby
552    jne %%hv_overlap
553    jmp %%loop_x_h_overlap
554
555%%v_overlap:
556    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
557                _, sby, see, lstride
558
    ; build the seeds for the current and the previous (top) block row at
    ; once, packed into one 32-bit register
559    movzx          sbyd, sbyb
560    imul           seed, [fg_dataq+FGData.seed], 0x00010001
561    imul            r7d, sbyd, 173 * 0x00010001
562    imul           sbyd, 37 * 0x01000100
563    add             r7d, (105 << 16) | 188
564    add            sbyd, (178 << 24) | (141 << 8)
565    and             r7d, 0x00ff00ff
566    and            sbyd, 0xff00ff00
567    xor            seed, r7d
568    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
569
570    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
571                offx, offy, see, lstride, luma, _, top_offxy
572
    ; same argument-slot reuse as on the non-overlap path
573    mov           lumaq, r9mp
574    lea             r12, [srcq+wq*2]
575    lea             r13, [dstq+wq*2]
576    lea             r14, [lumaq+wq*(2<<%2)]
577    mov            r9mp, r12
578    mov           r10mp, r13
579    mov           r11mp, r14
580    neg              wq
581
    ; LFSR step applied to both 16-bit halves: the two parities end up in
    ; bits 0 and 16 of r7d and are rotated into bits 31 and 15
582    ; we assume from the block above that bits 8-15 of r7d are zero'ed
583    mov             r6d, seed
584    or             seed, 0xeff4eff4
585    test           seeb, seeh
586    setp            r7b                     ; parity of top_seed
587    shr            seed, 16
588    shl             r7d, 16
589    test           seeb, seeh
590    setp            r7b                     ; parity of cur_seed
591    or              r6d, 0x00010001
592    xor             r7d, r6d
593    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
594
595    rorx          offyd, seed, 8
596    rorx          offxd, seed, 12
597    and           offyd, 0xf000f
598    and           offxd, 0xf000f
599    imul          offyd, 164>>%3
600    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
601    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
602
603    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
604                h, offxy, see, lstride, luma, _, top_offxy
605
606    mov      grain_lutq, grain_lutmp
607    mov              hd, hm
    ; split the packed offsets into two registers
608    movzx    top_offxyd, offxyw
609    shr          offxyd, 16
610
611%if %3
    ; only grain row 0 is blended with the top block; row 1 is broadcast
    ; into m18 and its low half is replaced by the blend in add_noise_v
612    movu           ym16, [grain_lutq+offxyq*2+82*0]
613    movu            ym1, [grain_lutq+top_offxyq*2+82*0]
614    vbroadcasti32x8 m18, [grain_lutq+offxyq*2+82*2]
615    movu           ym19, [grain_lutq+offxyq*2+82*4]
616    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
617    punpcklwd      ym17, ym1, ym16
618    punpckhwd       ym1, ym16
619%elif %2
    ; rows 0 and 1 are blended; m11 holds per-row weights in its two halves
620    movu           ym18, [grain_lutq+offxyq*2+82*0]
621    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
622    movu           ym17, [grain_lutq+top_offxyq*2+82*0]
623    vinserti32x8    m17, [grain_lutq+top_offxyq*2+82*2], 1
624    movu           ym19, [grain_lutq+offxyq*2+82*4]
625    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
626    punpcklwd       m16, m17, m18
627    punpckhwd       m17, m18
628%else
    ; row 0 is interleaved as (top, cur) and row 1 as (cur, top), so the
    ; single weight pair in m11 applies swapped weights to the second row
629    movu            m18, [grain_lutq+offxyq*2+82*0]
630    movu            m19, [grain_lutq+top_offxyq*2+82*0]
631    movu             m2, [grain_lutq+offxyq*2+82*2]
632    movu            m16, [grain_lutq+top_offxyq*2+82*2]
633    punpckhwd        m1, m19, m18
634    punpcklwd       m19, m18
635    punpckhwd       m18, m2, m16
636    punpcklwd        m2, m16
637%endif
638    call %%add_noise_v
    ; the rest of the column needs no vertical blending
639    sub              hb, 2<<%2
640    jg %%loop_y
641    add              wq, 32>>%2
642    jge .end
643    mov            srcq, r9mp
644    mov            dstq, r10mp
645    mov           lumaq, r11mp
646    lea            srcq, [srcq+wq*2]
647    lea            dstq, [dstq+wq*2]
648    lea           lumaq, [lumaq+wq*(2<<%2)]
649
650    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
651    ; to %%v_overlap, and instead always fall-through to %%hv_overlap
652%%hv_overlap:
    ; same paired LFSR step as above; r7 holds h at this point
653    ; we assume from the block above that bits 8-15 of r7d are zero'ed
654    mov             r6d, seed
655    or             seed, 0xeff4eff4
656    test           seeb, seeh
657    setp            r7b                     ; parity of top_seed
658    shr            seed, 16
659    shl             r7d, 16
660    test           seeb, seeh
661    setp            r7b                     ; parity of cur_seed
662    or              r6d, 0x00010001
663    xor             r7d, r6d
664    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
665
666    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
667                offx, offy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy
668
    ; derived from the previous column's offsets, before they are replaced
669    lea  topleft_offxyq, [top_offxyq+(32>>%2)]
670    lea     left_offxyq, [offyq+(32>>%2)]
671    rorx          offyd, seed, 8
672    rorx          offxd, seed, 12
673    and           offyd, 0xf000f
674    and           offxd, 0xf000f
675    imul          offyd, 164>>%3
676    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
677    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
678
679    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
680                h, offxy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy
681
682    mov      grain_lutq, grain_lutmp
683    mov              hd, hm
684    movzx    top_offxyd, offxyw
685    shr          offxyd, 16
686
687    ; grain = grain_lut[offy+y][offx+x]
688%if %2
689    movd           xm16, [grain_lutq+left_offxyq*2+82*0]
690    vinserti32x4    m16, [grain_lutq+left_offxyq*2+82*2], 2
691    movd           xm17, [grain_lutq+left_offxyq*2+82*4]
692    vinserti32x4    m17, [grain_lutq+left_offxyq*2+82*6], 2
693    movu           ym18, [grain_lutq+offxyq*2+82*0]
694    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
695    movu           ym19, [grain_lutq+offxyq*2+82*4]
696    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    ; m0 = horizontal blend (left, cur) for the current block's rows
697    punpckldq       m16, m17
698    punpckldq       m17, m18, m19
699    punpcklwd       m16, m17
700    movu            ym1, [grain_lutq+top_offxyq*2+82*0]
701    movd           xm17, [grain_lutq+topleft_offxyq*2+82*0]
702    mova             m0, m20
703    vpdpwssd         m0, m16, m10
    ; m16 = horizontal blend (topleft, top) for the top block's row(s)
704%if %3
705    punpcklwd      xm17, xm1
706    mova           xm16, xm20
707    vpdpwssd       xm16, xm17, xm10
708    psrad          xm16, 1
709%else
710    vinserti32x8     m1, [grain_lutq+top_offxyq*2+82*2], 1
711    vinserti32x4    m17, [grain_lutq+topleft_offxyq*2+82*2], 2
712    punpcklwd       m17, m1
713    mova            m16, m20
714    vpdpwssd        m16, m17, m10
715    psrad           m16, 1
716%endif
717    psrad            m0, 1
718    packssdw         m0, m16
719    vpsravw          m0, m21
    ; patch the blended entries into the current rows (m18, m19) and the
    ; top rows (m1) before the vertical blend
720    vmovdqa32   m18{k3}, m0
721    vpshufd     m19{k3}, m0, q0321
722%if %3
723    vpunpckhdq  ym1{k3}, ym0, ym0
724    punpcklwd      ym17, ym1, ym18
725    punpckhwd       ym1, ym18
726%else
727    vpunpckhdq   m1{k3}, m0, m0
728    punpcklwd       m16, m1, m18
729    punpckhwd       m17, m1, m18
730%endif
731%else
    ; m18/m2 = current rows 0/1, m19/m0 = top rows 0/1
732    movu            m18, [grain_lutq+offxyq*2+82*0]
733    movu            m19, [grain_lutq+top_offxyq*2+82*0]
734    movd           xm17, [grain_lutq+left_offxyq*2+82*0]
735    pinsrd         xm17, [grain_lutq+topleft_offxyq*2+82*0], 1
736    punpckldq      xm16, xm18, xm19
737    punpcklwd      xm17, xm16
738    movu             m2, [grain_lutq+offxyq*2+82*2]
739    movu             m0, [grain_lutq+top_offxyq*2+82*2]
740    movd           xm16, [grain_lutq+left_offxyq*2+82*2]
741    pinsrd         xm16, [grain_lutq+topleft_offxyq*2+82*2], 1
742    punpckldq       xm1, xm2, xm0
743    punpcklwd       xm1, xm16, xm1
744    mova           xm16, xm20
745    vpdpwssd       xm16, xm17, xm10
746    mova           xm17, xm20
747    vpdpwssd       xm17, xm1, xm10
    ; row 0 interleaved as (top, cur), row 1 as (cur, top)
748    punpckhwd        m1, m19, m18
749    punpcklwd       m19, m18
750    punpckhwd       m18, m2, m0
751    punpcklwd        m2, m0
752    psrad          xm16, 1
753    psrad          xm17, 1
754    packssdw       xm16, xm17
755    vpsravw        xm16, xm21
    ; the two shuffles differ because the rows use opposite pair orders
756    vpshuflw    m19{k4}, m16, q1302
757    punpckhqdq     xm16, xm16
758    vpshuflw     m2{k4}, m16, q3120
759%endif
760    call %%add_noise_v
    ; remaining rows of this column only need the horizontal blend
761    sub              hb, 2<<%2
762    jg %%loop_y_h_overlap
763    add              wq, 32>>%2
764    jge .end
765    mov            srcq, r9mp
766    mov            dstq, r10mp
767    mov           lumaq, r11mp
768    lea            srcq, [srcq+wq*2]
769    lea            dstq, [dstq+wq*2]
770    lea           lumaq, [lumaq+wq*(2<<%2)]
771    jmp %%hv_overlap
772
773ALIGN function_align
; add_noise_v: vertical overlap blend of the interleaved (top, cur) pairs
; prepared by the caller, leaving the blended grain in m18 (and m19 when
; there is no subsampling); falls through to add_noise.
774%%add_noise_v:
775%if %3
776    mova           ym16, ym20
777    vpdpwssd       ym16, ym17, ym11
778    mova           ym17, ym20
779    vpdpwssd       ym17, ym1, ym11
780    psrad          ym16, 1
781    psrad          ym17, 1
782    packssdw       ym16, ym17
    ; k1 covers the low 16 words, so only row 0 of m18 is replaced
783    vpsravw     m18{k1}, m16, m21
784%elif %2
785    mova            m18, m20
786    vpdpwssd        m18, m16, m11
787    mova            m16, m20
788    vpdpwssd        m16, m17, m11
789    psrad           m18, 1
790    psrad           m16, 1
791    packssdw        m18, m16
792    vpsravw         m18, m21
793%else
794    mova            m16, m20
795    vpdpwssd        m16, m1, m11
796    mova            m17, m20
797    vpdpwssd        m17, m18, m11
798    mova            m18, m20
799    vpdpwssd        m18, m19, m11
800    mova            m19, m20
801    vpdpwssd        m19, m2, m11
802    REPX   {psrad x, 1}, m16, m17, m18, m19
803    packssdw        m18, m16
804    packssdw        m19, m17
805    vpsravw         m18, m21
806    vpsravw         m19, m21
807%endif
; add_noise: apply the grain in m18/m19 to the pixels at srcq and store to
; dstq. Advances srcq, dstq, lumaq and grain_lutq to the next group of
; rows. Uses m0-m3, m16, m17 and k5 as scratch.
808%%add_noise:
809%if %2
    ; load four luma rows (every other row with ss_ver) and average each
    ; even/odd pixel pair with pavgw; m2 and m3 end up holding two
    ; downsampled rows each, matching the layout of the src loads below
810    mova             m2, [lumaq+lstrideq*(0<<%3)]
811    mova             m0, [lumaq+lstrideq*(1<<%3)]
812    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
813    mova             m3, [lumaq+lstrideq*(0<<%3)]
814    mova             m1, [lumaq+lstrideq*(1<<%3)]
815    mova            m16, m12
816    vpermi2w        m16, m2, m0
817    vpermt2w         m2, m13, m0
818    mova            m17, m12
819    vpermi2w        m17, m3, m1
820    vpermt2w         m3, m13, m1
821    pavgw            m2, m16
822    pavgw            m3, m17
823%elif %1
824    mova             m2, [lumaq+lstrideq*0]
825    mova             m3, [lumaq+lstrideq*1]
826%endif
827%if %2
828    mova           ym16, [srcq+strideq*0]
829    vinserti32x8    m16, [srcq+strideq*1], 1
830    lea            srcq, [srcq+strideq*2]
831%else
832    mova            m16, [srcq+strideq*0]
833%endif
834%if %1
    ; m2 = m14 + luma*uv_luma_mult + src*uv_mult for the first register
835    punpckhwd       m17, m2, m16
836    mova             m0, m14
837    vpdpwssd         m0, m17, m15
838    punpcklwd       m17, m2, m16
839    mova             m2, m14
840    vpdpwssd         m2, m17, m15
841%endif
842%if %2
843    mova           ym17, [srcq+strideq*0]
844    vinserti32x8    m17, [srcq+strideq*1], 1
845%else
846    mova            m17, [srcq+strideq*1]
847%endif
848%if %1
849    psrad            m0, 6
850    psrad            m2, 6
851    packusdw         m2, m0
    ; the same combination for the second register
852    punpckhwd        m0, m3, m17
853    mova             m1, m14
854    vpdpwssd         m1, m15, m0
855    punpcklwd        m0, m3, m17
856    mova             m3, m14
857    vpdpwssd         m3, m15, m0
858    psrad            m1, 6
859    psrad            m3, 6
860    packusdw         m3, m1
    ; clamp the scaling index to bdmax
861    pminuw           m2, m4
862    pminuw           m3, m4
863
; Shared tail, emitted only by the not-csfl expansion; the csfl expansion
; jumps here with the masked luma values in m2/m3.
864.add_noise_main:
865    ; scaling[luma_src]
    ; gathers clear their mask register, so k5 is reloaded before each one
866    kmovw            k5, k1
867    pand             m1, m5, m2
868    vpgatherdd   m0{k5}, [scalingq+m1]
869    kmovw            k5, k1
870    psrld            m2, 16
871    vpgatherdd   m1{k5}, [scalingq+m2]
    ; merge the even- and odd-pixel results into one byte per word
872    vpshufb      m0{k2}, m1, m6
873    kmovw            k5, k1
874    psrld            m1, m3, 16
875    vpgatherdd   m2{k5}, [scalingq+m1]
876    kmovw            k5, k1
877    pand             m3, m5
878    vpgatherdd   m1{k5}, [scalingq+m3]
879    vpshufb      m1{k2}, m2, m6
880
881    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
882    vpsllvw          m0, m7
883    vpsllvw          m1, m7
884    pmulhrsw        m18, m0
885    pmulhrsw        m19, m1
886    add      grain_lutq, 82*(4<<%2)
887    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
888    lea            srcq, [srcq+strideq*2]
889    paddw           m16, m18
890    paddw           m17, m19
    ; clamp to [m8, m9]
891    pmaxsw          m16, m8
892    pmaxsw          m17, m8
893    pminsw          m16, m9
894    pminsw          m17, m9
895%if %2
896    mova          [dstq+strideq*0], ym16
897    vextracti32x8 [dstq+strideq*1], m16, 1
898    lea            dstq, [dstq+strideq*2]
899    mova          [dstq+strideq*0], ym17
900    vextracti32x8 [dstq+strideq*1], m17, 1
901%else
902    mova [dstq+strideq*0], m16
903    mova [dstq+strideq*1], m17
904%endif
905    lea            dstq, [dstq+strideq*2]
906    ret
907%else
    ; csfl: the luma value itself is the scaling index, masked with m4
908%if %2
909    pand             m2, m4
910    pand             m3, m4
911%else
912    pand             m2, m4, [lumaq+lstrideq*0]
913    pand             m3, m4, [lumaq+lstrideq*1]
914%endif
915    jmp .add_noise_main
916%endif
917%endmacro
918
    ; first expansion: chroma_scaling_from_luma == 0
919    %%FGUV_32x32xN_LOOP 1, %2, %3
920.csfl:
    ; second expansion: chroma_scaling_from_luma != 0
921    %%FGUV_32x32xN_LOOP 0, %2, %3
922.end:
923    RET
924%endmacro
925
; Instantiate the chroma functions; arguments are name, ss_hor, ss_ver.
926FGUV_FN 420, 1, 1
927FGUV_FN 422, 1, 0
928FGUV_FN 444, 0, 0
929
930%endif
931