xref: /aosp_15_r20/external/libdav1d/src/x86/filmgrain_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1; Copyright © 2022, VideoLAN and dav1d authors
2; Copyright © 2022, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28%include "x86/filmgrain_common.asm"
29
30%if ARCH_X86_64
31
; Read-only constants used by the luma (fgy) and chroma (fguv) kernels below.
; Both kernels address them relative to fg_min through the `base` define.
32SECTION_RODATA 64
33
; pb_even / pb_odd: byte indices 0,2,..,126 and 1,3,..,127. The chroma kernel
; loads them into m11/m12 and uses them with vpermi2b/vpermt2b to gather the
; even and the odd luma bytes of two rows, which are then averaged with pavgb
; (horizontally subsampled layouts only).
34pb_even:       db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
35               db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
36               db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
37               db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
38pb_odd:        db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
39               db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
40               db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
41               db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
; interleave_hl: pshufb control (broadcast to every 128-bit lane as m4) that
; pairs byte 8+i with byte i. After the shuffle each 16-bit word holds
; (scaling[8+i], scaling[i]), matching the grain operands of the two pmaddubsw
; in .add_noise_h, which carry the grain byte in the high resp. low byte of
; each word and zero in the other byte.
42interleave_hl: db  8,  0,  9,  1, 10,  2, 11,  3, 12,  4, 13,  5, 14,  6, 15,  7
; Horizontal overlap weights as (left, cur) byte pairs for pmaddubsw.
; A (0, 32) pair followed by the pw_1024 rounding returns cur unchanged.
;   pb_27_17_17_27: two blended pixels, then two pass-through pixels
;   pb_23_22_0_32:  one blended pixel, then three pass-through pixels
;                   (selected when ss_hor is set)
43pb_27_17_17_27:        db 27, 17, 17, 27,  0, 32,  0, 32
44pb_23_22_0_32:         db 23, 22,  0, 32,  0, 32,  0, 32
; Vertical overlap weights as (top, cur) byte pairs: pb_27_17 for the first
; row, pb_17_27 for the second row, pb_23_22 for the single blended row used
; when ss_ver is set.
45pb_27_17:      times 2 db 27, 17
46pb_23_22:      times 2 db 23, 22
; pw_8: as a byte pattern this is 8,0,8,0; the chroma kernel uses it as a
; pshufb control to select bytes 8 and 0 of the 16 bytes loaded at uv_mult.
47pw_8:          times 2 dw 8
; pw_1024: pmulhrsw multiplier; pmulhrsw(x, 1024) == (x + 16) >> 5.
48pw_1024:       times 2 dw 1024
49pb_17_27:      times 2 db 17, 27
; fg_max / fg_min: clamp bounds, 4 identical bytes per entry so they can be
; fetched with vpbroadcastd.
;   fgy : fg_min + clip*4, fg_max + clip*8            -> 0/255 or 16/235
;   fguv: fg_min + clip*4, fg_max + (clip << is_id)*4 -> 0/255, 16/240, 16/235
; where clip is FGData.clip_to_restricted_range (assumed to be 0 or 1).
50fg_max:        times 4 db 255
51               times 4 db 240
52               times 4 db 235
53fg_min:        times 4 db 0
54               times 4 db 16
; noise_rnd: pmulhrsw multipliers, addressed as noise_rnd + scaling_shift*4 - 32,
; so the first entry corresponds to scaling_shift == 8 and the last to 11
; (assumes scaling_shift is within 8..11 -- TODO confirm against callers).
; pmulhrsw(x, 1 << (15 - s)) == (x + (1 << (s - 1))) >> s.
55noise_rnd:     times 2 dw 128
56               times 2 dw 64
57               times 2 dw 32
58               times 2 dw 16
59
60SECTION .text
61
; fgy_32x32xn_8bpc: add film grain to 8-bit luma. The picture is processed in
; columns of 32 pixels; each inner-loop iteration handles two rows (one per
; 256-bit half of a zmm register).
;
; Named registers: dst, src, stride, fg_data, w, scaling arrive in registers;
; grain_lut, h and sby are reloaded from their argument slots (grain_lutmp,
; hm, sbym); see and overlap are working registers. DEFINE_ARGS re-labels the
; same registers several times below, so e.g. offx/offxy, offy/grain_lut and
; overlap/left_offxy are aliases of one another.
;
; Vector/mask registers after the prologue:
;   m0-m3  256 bytes loaded from scaling (lookup table indexed by the pixel)
;   m4     interleave_hl shuffle control
;   m5     zero
;   m6     rounding multiplier selected by FGData.scaling_shift
;   m7/m8  lower/upper clamp selected by FGData.clip_to_restricted_range
;   m9     pw_1024 (rounding for the overlap blends)
;   m10    horizontal overlap weights (pb_27_17_17_27)
;   m12    vertical overlap weights: 27,17 in the low 256 bits (row 0),
;          17,27 in the high 256 bits (row 1)
;   k1     0x0000000f0000000f: as a byte mask it selects the first 4 bytes of
;          each row; as a qword mask it selects the low 256 bits
;
; Paths: .loop_x (no overlap), .loop_x_h_overlap (blend with the previous
; column), .v_overlap (blend with the block row above, first column) and
; .hv_overlap (both).
62INIT_ZMM avx512icl
63cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \
64                                     grain_lut, h, sby, see, overlap
65%define base r11-fg_min
66    lea             r11, [fg_min]
67    mov             r6d, [fg_dataq+FGData.scaling_shift]
68    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
69    mov            sbyd, sbym
70    mov        overlapd, [fg_dataq+FGData.overlap_flag]
71    mov             r12, 0x0000000f0000000f ; h_overlap mask
72    mova             m0, [scalingq+64*0]
73    mova             m1, [scalingq+64*1]
74    mova             m2, [scalingq+64*2]
75    mova             m3, [scalingq+64*3]
76    kmovq            k1, r12
77    vbroadcasti32x4  m4, [base+interleave_hl]
78    vpbroadcastd   ym16, [base+pb_27_17]
79    vpbroadcastd    m12, [base+pb_17_27]
80    vpbroadcastd     m6, [base+noise_rnd+r6*4-32]
    ; r6b = (sby != 0); the vector loads in between do not modify flags
81    test           sbyd, sbyd
82    setnz           r6b
83    vpbroadcastd     m7, [base+fg_min+r7*4]
84    vpbroadcastd     m8, [base+fg_max+r7*8]
85    pxor             m5, m5
86    vpbroadcastd     m9, [base+pw_1024]
87    vpbroadcastq    m10, [base+pb_27_17_17_27]
    ; merge the row-0 weights (27,17) into the low 256 bits of m12
88    vmovdqa64   m12{k1}, m16
    ; vertical overlap is taken only when sby != 0 and the low byte of
    ; overlap_flag has bit 0 set
89    test            r6b, overlapb
90    jnz .v_overlap
91
    ; seed = fg_data.seed ^ ((((sby*37 + 178) & 0xff) << 8) | ((sby*173 + 105) & 0xff))
92    imul           seed, sbyd, (173 << 24) | 37
93    add            seed, (105 << 24) | 178
94    rorx           seed, seed, 24
95    movzx          seed, seew
96    xor            seed, [fg_dataq+FGData.seed]
97
98    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
99                h, sby, see, overlap
100
    ; src_bak = end of the row, w = negative distance to it (counts up to 0),
    ; dst = dst - src so that [dstq+srcq] addresses the destination
101    lea        src_bakq, [srcq+wq]
102    neg              wq
103    sub            dstq, srcq
104.loop_x:
    ; 16-bit LFSR step: after the OR, the low byte of (seeb & seeh) has its
    ; parity determined by seed bits 0, 1, 3 and 12 (the other bits are forced
    ; to 1). Even parity keeps seed >> 1, odd parity also sets bit 15.
105    rorx             r6, seeq, 1
106    or             seed, 0xeff4
107    test           seeb, seeh
108    lea            seed, [r6+0x8000]
109    cmovp          seed, r6d                 ; updated seed
    ; offy = (seed >> 8) & 15; the low 4 bits of the rotate are (seed >> 12) & 15,
    ; the bits rotated in above them fall outside the 32-bit lea result
110    rorx          offyd, seed, 8
111    rorx          offxq, seeq, 12
112    and           offyd, 0xf
113    imul          offyd, 164
114    lea           offxd, [offyq+offxq*2+829] ; offy*stride+offx
115
116    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
117                h, sby, see, overlap
118
119    mov      grain_lutq, grain_lutmp
120    mov              hd, hm
121.loop_y:
    ; two 32-byte grain rows, 82 bytes apart (the +829 bias above makes the
    ; first row live at offxy-82)
122    movu           ym21, [grain_lutq+offxyq-82]
123    vinserti32x8    m21, [grain_lutq+offxyq+ 0], 1
124    call .add_noise
125    sub              hb, 2
126    jg .loop_y
127    add              wq, 32
128    jge .end
129    lea            srcq, [src_bakq+wq]
    ; choose the path for the remaining columns
130    test       overlapd, overlapd
131    jz .loop_x
132    test           sbyd, sbyd
133    jnz .hv_overlap
134
135.loop_x_h_overlap:
    ; same LFSR step as in .loop_x
136    rorx             r6, seeq, 1
137    or             seed, 0xeff4
138    test           seeb, seeh
139    lea            seed, [r6+0x8000]
140    cmovp          seed, r6d                 ; updated seed
141
    ; left_offxy takes over the register that held overlap; from here on the
    ; path is selected by sby alone
142    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
143                h, sby, see, left_offxy
144
145    rorx          offyd, seed, 8
146    mov     left_offxyd, offxd               ; previous column's offy*stride
147    rorx          offxq, seeq, 12
148    and           offyd, 0xf
149    imul          offyd, 164
150    lea           offxd, [offyq+offxq*2+829] ; offy*stride+offx
151
152    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
153                h, sby, see, left_offxy
154
155    mov      grain_lutq, grain_lutmp
156    mov              hd, hm
157.loop_y_h_overlap:
158    movu           ym20, [grain_lutq+offxyq-82]
159    vinserti32x8    m20, [grain_lutq+offxyq+ 0], 1
    ; 4 grain bytes per row from the previous column's block, 32 bytes to the
    ; right of its start (-82+32 and 0+32); lane 2 is byte 32, i.e. row 1
160    movd           xm19, [grain_lutq+left_offxyq-50]
161    vinserti32x4    m19, [grain_lutq+left_offxyq+32], 2
    ; (left, cur) pairs -> weighted sum -> (x + 16) >> 5
162    punpcklbw       m19, m20
163    pmaddubsw       m19, m10, m19
164    pmulhrsw        m19, m9
    ; build the word operands that .add_noise would otherwise produce:
    ; m21 = upper halves (grain in the low byte), m20 = lower halves (grain in
    ; the high byte) with the first 4 bytes of each row replaced by the blend
165    punpckhbw       m21, m20, m5
166    packsswb    m20{k1}, m19, m19
167    punpcklbw       m20, m5, m20
168    call .add_noise_h
169    sub              hb, 2
170    jg .loop_y_h_overlap
171    add              wq, 32
172    jge .end
173    lea            srcq, [src_bakq+wq]
174    test           sbyd, sbyd
175    jnz .hv_overlap
176    jmp .loop_x_h_overlap
177
178.v_overlap:
179    DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \
180                h, sby, see, overlap
181
    ; Compute two 16-bit seeds at once: the high half is the seed of this
    ; block row (same formula as above), the low half that of row sby-1
    ; (188 == 105 - 173 and 141 == 178 - 37, modulo 256).
182    movzx           r6d, sbyb
183    imul           seed, [fg_dataq+FGData.seed], 0x00010001
184    imul            r7d, r6d, 173 * 0x00010001
185    imul            r6d, 37 * 0x01000100
186    add             r7d, (105 << 16) | 188
187    add             r6d, (178 << 24) | (141 << 8)
188    and             r7d, 0x00ff00ff
189    and             r6d, 0xff00ff00
190    xor            seed, r7d
191    xor            seed, r6d     ; (cur_seed << 16) | top_seed
192
193    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
194                h, sby, see, overlap
195
196    lea        src_bakq, [srcq+wq]
197    neg              wq
198    sub            dstq, srcq
199
    ; LFSR step on both packed seeds: the two parity bits are gathered in
    ; bits 0 and 16 of r7d, xor-ed against the seeds whose bits 0 and 16 were
    ; forced to 1, and the rotate by one then moves them into bits 15 and 31.
200    ; we assume from the block above that bits 8-15 of r7d are zero'ed
201    mov             r6d, seed
202    or             seed, 0xeff4eff4
203    test           seeb, seeh
204    setp            r7b          ; parity of top_seed
205    shr            seed, 16
206    shl             r7d, 16
207    test           seeb, seeh
208    setp            r7b          ; parity of cur_seed
209    or              r6d, 0x00010001
210    xor             r7d, r6d
211    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
212    rorx          offyd, seed, 8
213    rorx          offxd, seed, 12
214    and           offyd, 0xf000f
215    and           offxd, 0xf000f
216    imul          offyd, 164
    ; the extra 32*82 lands in the low (top) half only: the top block is read
    ; 32 rows further down
217    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
218    lea           offxd, [offyq+offxq*2+0x10001*829+32*82]
219
    ; top_offxy occupies r11, which held the constant-table base during the
    ; prologue
220    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
221                h, sby, see, overlap, top_offxy
222
223    mov      grain_lutq, grain_lutmp
224    mov              hd, hm
225    movzx    top_offxyd, offxyw
226    shr          offxyd, 16
227    movu           ym19, [grain_lutq+offxyq-82]
228    vinserti32x8    m19, [grain_lutq+offxyq+ 0], 1
229    movu           ym21, [grain_lutq+top_offxyq-82]
230    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
    ; interleave (top, cur) bytes for the weighted sum in .add_noise_v
231    punpckhbw       m20, m21, m19
232    punpcklbw       m21, m19
233    call .add_noise_v
    ; only the first two rows are blended; the rest of the column uses the
    ; plain loop
234    sub              hb, 2
235    jg .loop_y
236    add              wq, 32
237    jge .end
238    lea            srcq, [src_bakq+wq]
239
240    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
241    ; to .v_overlap, and instead always fall-through to h+v overlap
242.hv_overlap:
243    ; we assume from the block above that bits 8-15 of r7d are zero'ed
244    mov             r6d, seed
245    or             seed, 0xeff4eff4
246    test           seeb, seeh
247    setp            r7b          ; parity of top_seed
248    shr            seed, 16
249    shl             r7d, 16
250    test           seeb, seeh
251    setp            r7b          ; parity of cur_seed
252    or              r6d, 0x00010001
253    xor             r7d, r6d
254    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
255
256    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
257                h, sby, see, left_offxy, top_offxy, topleft_offxy
258
    ; remember the previous column's offsets before computing the new ones
259    mov  topleft_offxyd, top_offxyd
260    rorx          offyd, seed, 8
261    mov     left_offxyd, offxd
262    rorx          offxd, seed, 12
263    and           offyd, 0xf000f
264    and           offxd, 0xf000f
265    imul          offyd, 164
266    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
267    lea           offxd, [offyq+offxq*2+0x10001*829+32*82]
268
269    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
270                h, sby, see, left_offxy, top_offxy, topleft_offxy
271
272    mov      grain_lutq, grain_lutmp
273    mov              hd, hm
274    movzx    top_offxyd, offxyw
275    shr          offxyd, 16
    ; m19 = cur, m16 = left, m21 = top, m17 = top-left (two rows each)
276    movu           ym19, [grain_lutq+offxyq-82]
277    vinserti32x8    m19, [grain_lutq+offxyq+ 0], 1
278    movd           xm16, [grain_lutq+left_offxyq-50]
279    vinserti32x4    m16, [grain_lutq+left_offxyq+32], 2
280    movu           ym21, [grain_lutq+top_offxyq-82]
281    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
282    movd           xm17, [grain_lutq+topleft_offxyq-50]
283    vinserti32x4    m17, [grain_lutq+topleft_offxyq+32], 2
284    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
285    punpcklbw       m16, m19
286    pmaddubsw       m16, m10, m16
287    punpcklbw       m17, m21
288    pmaddubsw       m17, m10, m17
    ; the upper halves are outside the horizontal overlap, so they can be
    ; interleaved before the masked merges below
289    punpckhbw       m20, m21, m19
290    pmulhrsw        m16, m9
291    pmulhrsw        m17, m9
292    packsswb    m19{k1}, m16, m16
293    packsswb    m21{k1}, m17, m17
294    ; followed by v interpolation (top | cur -> cur)
295    punpcklbw       m21, m19
296    call .add_noise_v
    ; rows below the first two only need the horizontal blend
297    sub              hb, 2
298    jg .loop_y_h_overlap
299    add              wq, 32
    ; lea does not modify flags, so jl still tests the result of the add
300    lea            srcq, [src_bakq+wq]
301    jl .hv_overlap
302.end:
303    RET
; Local subroutines of fgy_32x32xn_8bpc. The three labels are entry points
; into one fall-through sequence that ends in a single ret.
304ALIGN function_align
; .add_noise_v
;   in:  m21/m20 = low/high halves of byte-interleaved (top, cur) grain
;   Blends them with the vertical weights in m12, rounds with m9 and packs
;   the result back to signed bytes in m21, then continues in .add_noise.
305.add_noise_v:
306    pmaddubsw       m20, m12, m20
307    pmaddubsw       m21, m12, m21
308    pmulhrsw        m20, m9
309    pmulhrsw        m21, m9
310    packsswb        m21, m20
; .add_noise
;   in:  m21 = grain bytes for two rows
;   Widens to words: m20 = lower halves with the grain in the high byte,
;   m21 = upper halves with the grain in the low byte (other byte zero).
311.add_noise:
312    punpcklbw       m20, m5, m21
313    punpckhbw       m21, m5
; .add_noise_h
;   in:  m20/m21 as produced above, srcq = current source row,
;        dstq = dst - src, grain_lutq = grain table pointer
;   out: two rows stored to the destination; srcq advanced by two rows and
;        grain_lutq by 2*82 bytes
;   Overwrites m16-m19 and k2.
314.add_noise_h:
315    mova           ym18, [srcq+strideq*0]
316    vinserti32x8    m18, [srcq+strideq*1], 1
    ; 256-entry byte lookup split into two 128-entry permutes (each uses the
    ; low 7 index bits); bit 7 of every source byte (k2) selects the half
317    mova            m19, m0
318    punpcklbw       m16, m18, m5
319    vpermt2b        m19, m18, m1 ; scaling[  0..127]
320    vpmovb2m         k2, m18
321    punpckhbw       m17, m18, m5
322    vpermi2b        m18, m2, m3  ; scaling[128..255]
323    vmovdqu8    m19{k2}, m18     ; scaling[src]
    ; pair scaling[8+i] with scaling[i] so that each pmaddubsw multiplies one
    ; of them by the grain byte and the other by zero
324    pshufb          m19, m4
325    pmaddubsw       m18, m19, m20
326    pmaddubsw       m19, m21
327    add      grain_lutq, 82*2
328    pmulhrsw        m18, m6      ; noise
329    pmulhrsw        m19, m6
    ; src + noise, saturate to bytes, clamp to [m7, m8]
330    paddw           m16, m18
331    paddw           m17, m19
332    packuswb        m16, m17
333    pmaxub          m16, m7
334    pminub          m16, m8
335    mova    [dstq+srcq], ym16
336    add            srcq, strideq
337    vextracti32x8 [dstq+srcq], m16, 1
338    add            srcq, strideq
339    ret
340
; FGUV_FN name, ss_hor, ss_ver
;   Emits fguv_32x32xn_i<name>_8bpc, the chroma counterpart of fgy_32x32xn_8bpc.
;   ss_hor / ss_ver are 1 when chroma is subsampled in that direction.
;
;   One zmm register holds two rows of 32 pixels (ss_hor = 0) or four rows of
;   16 pixels (ss_hor = 1).
;
;   The body is generated twice by the nested loop macro: first for the case
;   where FGData.chroma_scaling_from_luma is zero (the scaling index is a mix
;   of luma and chroma), then, behind the .csfl label, for the case where the
;   luma value is used directly. The second expansion jumps into the first
;   one's .add_noise_main for the shared tail.
;
;   Register notes:
;     * constants are addressed through the `base` define set up by the luma
;       function above (r11 - fg_min); r11 is reused as a pointer later, so
;       all [base+...] loads happen before that point
;     * m0-m9 and k1 play the same roles as in the luma function; m10 holds
;       the horizontal overlap weights matching ss_hor; m11/m12 = pb_even /
;       pb_odd; m13 = vertical overlap weights; m14/m15 = multipliers and
;       offset for the luma/chroma mix (first expansion only)
;     * the argument slots of uv_pl and is_id (r11mp, r12mp) are reused to
;       keep the end-of-row src and dst pointers once both have been read;
;       r13 holds the matching luma pointer
341%macro FGUV_FN 3 ; name, ss_hor, ss_ver
342cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \
343                                             scaling, grain_lut, h, sby, luma, \
344                                             overlap, uv_pl, is_id, _, stride3
345    lea             r11, [fg_min]
346    mov             r6d, [fg_dataq+FGData.scaling_shift]
347    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
348    mov             r9d, is_idm
349    mov            sbyd, sbym
350    mov        overlapd, [fg_dataq+FGData.overlap_flag]
    ; k1 marks the first 4 bytes of every row held in a zmm (4 rows when
    ; subsampled horizontally, 2 rows otherwise)
351%if %2
352    mov             r12, 0x000f000f000f000f ; h_overlap mask
353    vpbroadcastq    m10, [base+pb_23_22_0_32]
354    lea        stride3q, [strideq*3]
355%else
356    mov             r12, 0x0000000f0000000f
357    vpbroadcastq    m10, [base+pb_27_17_17_27]
358%endif
359    mova             m0, [scalingq+64*0]
360    mova             m1, [scalingq+64*1]
361    mova             m2, [scalingq+64*2]
362    mova             m3, [scalingq+64*3]
363    kmovq            k1, r12
364    vbroadcasti32x4  m4, [base+interleave_hl]
365    vpbroadcastd     m6, [base+noise_rnd+r6*4-32]
366    vpbroadcastd     m7, [base+fg_min+r7*4]
    ; fg_max index = clip << is_id
367    shlx            r7d, r7d, r9d
368    vpbroadcastd     m8, [base+fg_max+r7*4]
    ; r7b = (sby != 0), consumed by the overlap test in the loop macro
369    test           sbyd, sbyd
370    setnz           r7b
371    vpbroadcastd     m9, [base+pw_1024]
372    mova            m11, [base+pb_even]
373    mova            m12, [base+pb_odd]
374    pxor             m5, m5
375    mov              r5, r10mp      ; lstride
376    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
377    jne .csfl
378
; Inner macro: the complete column/row loop nest plus its local subroutines.
; Its first parameter is 1 for the luma/chroma-mix variant and 0 for the
; chroma-scaling-from-luma variant.
379%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
380    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
381                h, sby, see, overlap, uv_pl, _, _, stride3
382%if %1
    ; m14 = bytes 8 and 0 of the 16 bytes at uv_mult[uv_pl], repeated;
    ; m15 = uv_offset[uv_pl] as words
383    mov             r6d, uv_plm
384    vpbroadcastd    m16, [base+pw_8]
385    vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4]
386    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r6*4]
387    pshufb          m14, m16     ; uv_luma_mult, uv_mult
388%endif
    ; vertical overlap only when sby != 0 and bit 0 of overlap_flag is set
389    test            r7b, overlapb
390    jnz %%v_overlap
391
    ; single 16-bit seed for this block row (same derivation as for luma)
392    imul           seed, sbyd, (173 << 24) | 37
393    add            seed, (105 << 24) | 178
394    rorx           seed, seed, 24
395    movzx          seed, seew
396    xor            seed, [fg_dataq+FGData.seed]
397
398    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
399                offx, offy, see, overlap, _, _, _, stride3
400
    ; park the end-of-row pointers: src and dst in argument slots, luma in r13
    ; (luma advances twice as fast when subsampled horizontally)
401    mov           lumaq, r9mp
402    lea             r11, [srcq+wq]
403    lea             r12, [dstq+wq]
404    lea             r13, [lumaq+wq*(1+%2)]
405    mov           r11mp, r11
406    mov           r12mp, r12
407    neg              wq
408
409%%loop_x:
    ; 16-bit LFSR step: the parity of seed bits 0, 1, 3 and 12 decides whether
    ; bit 15 is set in seed >> 1
410    rorx             r6, seeq, 1
411    or             seed, 0xeff4
412    test           seeb, seeh
413    lea            seed, [r6+0x8000]
414    cmovp          seed, r6d     ; updated seed
415    rorx          offyd, seed, 8
416    rorx          offxq, seeq, 12
417    and           offyd, 0xf
418    imul          offyd, 164>>%3
419    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
420
421    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
422                h, offxy, see, overlap, _, _, _, stride3
423
424    mov      grain_lutq, grain_lutmp
425    mov              hd, hm
426%%loop_y:
    ; gather the grain rows (82 bytes apart) for this iteration
427%if %2
428    movu           xm21, [grain_lutq+offxyq+82*0]
429    vinserti128    ym21, [grain_lutq+offxyq+82*1], 1
430    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
431    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
432%else
433    movu           ym21, [grain_lutq+offxyq+82*0]
434    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
435%endif
436    call %%add_noise
437    sub              hb, 2<<%2
438    jg %%loop_y
439    add              wq, 32>>%2
440    jge .end
    ; next column: rebuild the pointers from the parked end-of-row values
441    mov            srcq, r11mp
442    mov            dstq, r12mp
443    lea           lumaq, [r13+wq*(1<<%2)]
444    add            srcq, wq
445    add            dstq, wq
446    test       overlapd, overlapd
447    jz %%loop_x
448    cmp       dword r8m, 0       ; sby
449    jne %%hv_overlap
450
451    ; horizontal overlap (without vertical overlap)
452%%loop_x_h_overlap:
453    rorx             r6, seeq, 1
454    or             seed, 0xeff4
455    test           seeb, seeh
456    lea            seed, [r6+0x8000]
457    cmovp          seed, r6d     ; updated seed
458
459    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
460                offx, offy, see, left_offxy, _, _, _, stride3
461
    ; offy still holds the previous column's offxy at this point
462    lea     left_offxyd, [offyq+(32>>%2)]         ; previous column's offy*stride+offx
463    rorx          offyd, seed, 8
464    rorx          offxq, seeq, 12
465    and           offyd, 0xf
466    imul          offyd, 164>>%3
467    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
468
469    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
470                h, offxy, see, left_offxy, _, _, _, stride3
471
472    mov      grain_lutq, grain_lutmp
473    mov              hd, hm
474%%loop_y_h_overlap:
    ; m20 = current grain rows, m19 = 4 bytes per row from the previous
    ; column's grain block
475%if %2
476    movu           xm20, [grain_lutq+offxyq     +82*0]
477    movd           xm19, [grain_lutq+left_offxyq+82*0]
478    vinserti32x4   ym20, [grain_lutq+offxyq     +82*1], 1
479    vinserti32x4   ym19, [grain_lutq+left_offxyq+82*1], 1
480    vinserti32x4    m20, [grain_lutq+offxyq     +82*2], 2
481    vinserti32x4    m19, [grain_lutq+left_offxyq+82*2], 2
482    vinserti32x4    m20, [grain_lutq+offxyq     +82*3], 3
483    vinserti32x4    m19, [grain_lutq+left_offxyq+82*3], 3
484%else
485    movu           ym20, [grain_lutq+offxyq     + 0]
486    movd           xm19, [grain_lutq+left_offxyq+ 0]
487    vinserti32x8    m20, [grain_lutq+offxyq     +82], 1
488    vinserti32x4    m19, [grain_lutq+left_offxyq+82], 2
489%endif
    ; blend (left, cur) with m10, round, merge the result into the first
    ; bytes of each row and widen to the word layout add_noise_h expects
490    punpcklbw       m19, m20
491    pmaddubsw       m19, m10, m19
492    punpckhbw       m21, m20, m5
493    pmulhrsw        m19, m9
494    vpacksswb   m20{k1}, m19, m19
495    punpcklbw       m20, m5, m20
496    call %%add_noise_h
497    sub              hb, 2<<%2
498    jg %%loop_y_h_overlap
499    add              wq, 32>>%2
500    jge .end
501    mov            srcq, r11mp
502    mov            dstq, r12mp
503    lea           lumaq, [r13+wq*(1<<%2)]
504    add            srcq, wq
505    add            dstq, wq
506    cmp       dword r8m, 0       ; sby
507    jne %%hv_overlap
508    jmp %%loop_x_h_overlap
509
510%%v_overlap:
511    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
512                _, sby, see, overlap, _, _, _, stride3
513
    ; two packed 16-bit seeds: high half for this block row, low half for the
    ; row above (188 == 105 - 173 and 141 == 178 - 37, modulo 256)
514    movzx          sbyd, sbyb
515    imul           seed, [fg_dataq+FGData.seed], 0x00010001
516    imul            r7d, sbyd, 173 * 0x00010001
517    imul           sbyd, 37 * 0x01000100
518    add             r7d, (105 << 16) | 188
519    add            sbyd, (178 << 24) | (141 << 8)
520    and             r7d, 0x00ff00ff
521    and            sbyd, 0xff00ff00
522    xor            seed, r7d
523    xor            seed, sbyd    ; (cur_seed << 16) | top_seed
524
    ; m13 = vertical overlap weights, k3 = lanes rewritten by the blend:
    ;   ss_ver:      one blended row   (k3 = low 16 bytes)
    ;   ss_hor only: two blended rows  (k3 = low 32 bytes)
    ;   neither:     two full-width rows, 27,17 for row 0 and 17,27 for row 1
525%if %3
526    vpbroadcastd    m13, [base+pb_23_22]
527    kxnorw           k3, k3, k3  ; v_overlap mask
528%elif %2
529    vbroadcasti32x8 m13, [base+pb_27_17]
530    kxnord           k3, k3, k3
531    pshufd          m13, m13, q0000 ; 8x27_17, 8x17_27
532%else
533    vpbroadcastd   ym16, [base+pb_27_17]
534    vpbroadcastd    m13, [base+pb_17_27]
535    vmovdqa64   m13{k1}, m16
536%endif
537
538    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
539                offx, offy, see, overlap, top_offxy, _, _, stride3
540
    ; last use of the table base was above; r11 now becomes a scratch pointer
541    mov           lumaq, r9mp
542    lea             r11, [srcq+wq]
543    lea             r12, [dstq+wq]
544    lea             r13, [lumaq+wq*(1<<%2)]
545    mov           r11mp, r11
546    mov           r12mp, r12
547    neg              wq
548
    ; LFSR step on both packed seeds (parity bits collected in bits 0 and 16
    ; of r7d, then rotated into bits 15 and 31)
549    ; we assume from the block above that bits 8-15 of r7d are zero'ed
550    mov             r6d, seed
551    or             seed, 0xeff4eff4
552    test           seeb, seeh
553    setp            r7b          ; parity of top_seed
554    shr            seed, 16
555    shl             r7d, 16
556    test           seeb, seeh
557    setp            r7b          ; parity of cur_seed
558    or              r6d, 0x00010001
559    xor             r7d, r6d
560    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
561    rorx          offyd, seed, 8
562    rorx          offxd, seed, 12
563    and           offyd, 0x000f000f
564    and           offxd, 0x000f000f
565    imul          offyd, 164>>%3
566    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
567    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
568
569    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
570                h, offxy, see, overlap, top_offxy, _, _, stride3
571
572    mov      grain_lutq, grain_lutmp
573    mov              hd, hm
574    movzx    top_offxyd, offxyw
575    shr          offxyd, 16
576
    ; load the rows to blend interleaved as (top, cur) in m19/m20 and put the
    ; rows that stay unmodified into m21; add_noise_v merges the blend into
    ; the low lanes of m21 under k3
577%if %3
578    movu           xm18, [grain_lutq+offxyq+82*0]
579    movu           xm20, [grain_lutq+top_offxyq+82*0]
580    ; only interpolate first line, insert remaining line unmodified
581    vbroadcasti128 ym21, [grain_lutq+offxyq+82*1]
582    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
583    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
584    punpcklbw      xm19, xm20, xm18
585    punpckhbw      xm20, xm18
586%elif %2
587    movu           xm18, [grain_lutq+offxyq+82*0]
588    vinserti128    ym18, [grain_lutq+offxyq+82*1], 1
589    movu           xm20, [grain_lutq+top_offxyq+82*0]
590    vinserti32x4   ym20, [grain_lutq+top_offxyq+82*1], 1
591    vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2]
592    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
593    punpcklbw      ym19, ym20, ym18
594    punpckhbw      ym20, ym18
595%else
596    movu           ym21, [grain_lutq+offxyq+82*0]
597    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
598    movu           ym20, [grain_lutq+top_offxyq+82*0]
599    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
600%endif
601    call %%add_noise_v
    ; remaining rows of this column need no vertical blend
602    sub              hb, 2<<%2
603    jg %%loop_y
604    add              wq, 32>>%2
605    jge .end
606    mov            srcq, r11mp
607    mov            dstq, r12mp
608    lea           lumaq, [r13+wq*(1<<%2)]
609    add            srcq, wq
610    add            dstq, wq
611
    ; falls through: every following column of this block row blends in both
    ; directions
612%%hv_overlap:
613    ; we assume from the block above that bits 8-15 of r7d are zero'ed
614    mov             r6d, seed
615    or             seed, 0xeff4eff4
616    test           seeb, seeh
617    setp            r7b          ; parity of top_seed
618    shr            seed, 16
619    shl             r7d, 16
620    test           seeb, seeh
621    setp            r7b          ; parity of cur_seed
622    or              r6d, 0x00010001
623    xor             r7d, r6d
624    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
625
626    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
627                offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
628
    ; left / top-left = previous column's offsets moved one block width right
629    lea  topleft_offxyd, [top_offxyq+(32>>%2)]
630    lea     left_offxyd, [offyq+(32>>%2)]
631    rorx          offyd, seed, 8
632    rorx          offxd, seed, 12
633    and           offyd, 0x000f000f
634    and           offxd, 0x000f000f
635    imul          offyd, 164>>%3
636    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
637    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
638
639    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
640                h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
641
642    mov      grain_lutq, grain_lutmp
643    mov              hd, hm
644    movzx    top_offxyd, offxyw
645    shr          offxyd, 16
646
    ; m21 = cur, m16 = left, m20 = top, m18 = top-left. Both horizontal blends
    ; share one pmaddubsw: punpcklqdq places the (left, cur) pairs in the low
    ; qword and the (top-left, top) pairs in the high qword of each lane, and
    ; vpalignr by 4 later brings the blended top bytes to the lane start.
647%if %2
648    movu           xm21, [grain_lutq+offxyq+82*0]
649    movd           xm16, [grain_lutq+left_offxyq+82*0]
650    vinserti128    ym21, [grain_lutq+offxyq+82*1], 1
651    vinserti128    ym16, [grain_lutq+left_offxyq+82*1], 1
652    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
653    vinserti32x4    m16, [grain_lutq+left_offxyq+82*2], 2
654    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
655    vinserti32x4    m16, [grain_lutq+left_offxyq+82*3], 3
656    movd           xm18, [grain_lutq+topleft_offxyq+82*0]
657    movu           xm20, [grain_lutq+top_offxyq]
658    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
659    punpcklbw       m16, m21
660%if %3
661    punpcklbw      xm18, xm20
662%else
663    vinserti128    ym18, [grain_lutq+topleft_offxyq+82*1], 1
664    vinserti128    ym20, [grain_lutq+top_offxyq+82*1], 1
665    punpcklbw      ym18, ym20
666%endif
667    punpcklqdq      m16, m18
668    pmaddubsw       m16, m10, m16
669    pmulhrsw        m16, m9
670    packsswb        m16, m16
671    vmovdqu8    m21{k1}, m16
672%if %3
673    vpalignr   xm20{k1}, xm16, xm16, 4
674    punpcklbw      xm19, xm20, xm21
675    punpckhbw      xm20, xm21
676%else
677    vpalignr   ym20{k1}, ym16, ym16, 4
678    punpcklbw      ym19, ym20, ym21
679    punpckhbw      ym20, ym21
680%endif
681%else
682    movu           ym21, [grain_lutq+offxyq+82*0]
683    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
684    movd           xm16, [grain_lutq+left_offxyq+82*0]
685    vinserti32x4    m16, [grain_lutq+left_offxyq+82*1], 2
686    movu           ym20, [grain_lutq+top_offxyq+82*0]
687    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
688    movd           xm18, [grain_lutq+topleft_offxyq+82*0]
689    vinserti32x4    m18, [grain_lutq+topleft_offxyq+82*1], 2
690    punpcklbw       m16, m21
691    punpcklbw       m18, m20
692    punpcklqdq      m16, m18
693    pmaddubsw       m16, m10, m16
694    pmulhrsw        m16, m9
695    packsswb        m16, m16
696    vpalignr    m20{k1}, m16, m16, 4
697    vmovdqu8    m21{k1}, m16
698%endif
699    call %%add_noise_v
    ; rows below the vertically blended ones only need the horizontal blend
700    sub              hb, 2<<%2
701    jg %%loop_y_h_overlap
702    add              wq, 32>>%2
703    jge .end
704    mov            srcq, r11mp
705    mov            dstq, r12mp
706    lea           lumaq, [r13+wq*(1<<%2)]
707    add            srcq, wq
708    add            dstq, wq
709    jmp %%hv_overlap
; Local subroutines: three entry points into one fall-through sequence.
710ALIGN function_align
; add_noise_v: vertical blend of the (top, cur) pairs with m13, rounded with
; m9 and packed to signed bytes in m21. In the subsampled variants only the
; lanes selected by k3 are replaced; the rest of m21 was preloaded by the
; caller.
711%%add_noise_v:
712%if %3
713    pmaddubsw      xm19, xm13, xm19
714    pmaddubsw      xm20, xm13, xm20
715    pmulhrsw       xm19, xm9
716    pmulhrsw       xm20, xm9
717    vpacksswb   m21{k3}, m19, m20
718%elif %2
719    pmaddubsw      ym19, ym13, ym19
720    pmaddubsw      ym20, ym13, ym20
721    pmulhrsw       ym19, ym9
722    pmulhrsw       ym20, ym9
723    vpacksswb   m21{k3}, m19, m20
724%else
725    punpcklbw       m19, m20, m21
726    punpckhbw       m20, m21
727    pmaddubsw       m19, m13, m19
728    pmaddubsw       m20, m13, m20
729    pmulhrsw        m19, m9
730    pmulhrsw        m20, m9
731    packsswb        m21, m19, m20
732%endif
; add_noise: widen the grain bytes in m21 to words (m20 = lower halves with
; the grain in the high byte, m21 = upper halves with the grain in the low
; byte).
733%%add_noise:
734    punpcklbw       m20, m5, m21
735    punpckhbw       m21, m5
; add_noise_h: load luma (m18) and chroma (m17), derive the scaling index,
; then continue in .add_noise_main.
736%%add_noise_h:
    ; when subsampled vertically only every second luma row is read
737    mova           ym18, [lumaq+lstrideq*(0<<%3)]
738    vinserti32x8    m18, [lumaq+lstrideq*(1<<%3)], 1
739%if %2
    ; four luma rows in m18/m16: gather the even and the odd bytes and average
    ; them (pavgb rounds up), giving one luma value per chroma pixel; the
    ; chroma row loads are interleaved with the permutes
740    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
741    mova           ym16, [lumaq+lstrideq*(0<<%3)]
742    vinserti32x8    m16, [lumaq+lstrideq*(1<<%3)], 1
743    mova           xm17, [srcq+strideq*0]
744    mova            m19, m11
745    vpermi2b        m19, m18, m16
746    vinserti128    ym17, [srcq+strideq*1], 1
747    vpermt2b        m18, m12, m16
748    vinserti32x4    m17, [srcq+strideq*2], 2
749    pavgb           m18, m19
750    vinserti32x4    m17, [srcq+stride3q ], 3
751%else
752    mova           ym17, [srcq+strideq*0]
753    vinserti32x8    m17, [srcq+strideq*1], 1
754%endif
755%if %1
    ; scaling index = sat_u8(((luma, chroma) . m14 >> 6) + m15)
756    punpckhbw       m19, m18, m17
757    punpcklbw       m18, m17     ; { luma, chroma }
758    pmaddubsw       m19, m14
759    pmaddubsw       m18, m14
760    psraw           m19, 6
761    psraw           m18, 6
762    paddw           m19, m15
763    paddw           m18, m15
764    packuswb        m18, m19
; Shared tail for both expansions.
;   in:  m18 = scaling index bytes, m17 = chroma bytes, m20/m21 = grain words
;   out: rows stored at dstq; srcq, dstq, lumaq and grain_lutq advanced
765.add_noise_main:
    ; 256-entry lookup as two 128-entry permutes selected by bit 7 (k2)
766    mova            m19, m0
767    vpermt2b        m19, m18, m1 ; scaling[  0..127]
768    vpmovb2m         k2, m18
769    vpermi2b        m18, m2, m3  ; scaling[128..255]
770    vmovdqu8    m19{k2}, m18     ; scaling[src]
771    pshufb          m19, m4
772    pmaddubsw       m18, m19, m20
773    pmaddubsw       m19, m21
774    add      grain_lutq, 82*2<<%2
775    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
776    lea            srcq, [srcq+strideq*(2<<%2)]
777    pmulhrsw        m18, m6      ; noise
778    pmulhrsw        m19, m6
779    punpcklbw       m16, m17, m5 ; chroma
780    punpckhbw       m17, m5
    ; chroma + noise, saturate to bytes, clamp to [m7, m8]
781    paddw           m16, m18
782    paddw           m17, m19
783    packuswb        m16, m17
784    pmaxub          m16, m7
785    pminub          m16, m8
786%if %2
787    mova          [dstq+strideq*0], xm16
788    vextracti128  [dstq+strideq*1], ym16, 1
789    vextracti32x4 [dstq+strideq*2], m16, 2
790    vextracti32x4 [dstq+stride3q ], m16, 3
791%else
792    mova          [dstq+strideq*0], ym16
793    vextracti32x8 [dstq+strideq*1], m16, 1
794%endif
795    lea            dstq, [dstq+strideq*(2<<%2)]
796    ret
797%else
    ; scaling-from-luma variant: m18 already is the index; reuse the tail
    ; emitted by the first expansion
798    jmp .add_noise_main
799%endif
800%endmacro
801
802    %%FGUV_32x32xN_LOOP 1, %2, %3
803.csfl:
804    %%FGUV_32x32xN_LOOP 0, %2, %3
805.end:
806    RET
807%endmacro
808
; Instantiate the chroma kernel once per layout (name, ss_hor, ss_ver):
; fguv_32x32xn_i420_8bpc, fguv_32x32xn_i422_8bpc and fguv_32x32xn_i444_8bpc.
809FGUV_FN 420, 1, 1
810FGUV_FN 422, 1, 0
811FGUV_FN 444, 0, 0
812
813%endif ; ARCH_X86_64
814