xref: /aosp_15_r20/external/libdav1d/src/x86/filmgrain16_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2022, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2022, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker%include "x86/filmgrain_common.asm"
29*c0909341SAndroid Build Coastguard Worker
30*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
31*c0909341SAndroid Build Coastguard Worker
; Read-only lookup tables for the 16 bpc film grain kernels below.
; Most tables hold one entry (or one row) per bit depth and are indexed with
; is_12bpc = bdmax >> 11 (0 for bdmax 1023, 1 for bdmax 4095).
32*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 16
; vpshufb control used with byte mask 0xeeee...: within each dword, source
; byte 0 is copied to byte 2 and the other written bytes are zeroed
; (index -1 has its top bit set, which makes pshufb write zero).
33*c0909341SAndroid Build Coastguard Workerscale_mask:    db -1, -1,  0, -1, -1, -1,  4, -1, -1, -1,  8, -1, -1, -1, 12, -1
; Word pairs of shift counts.  Read as [scale_shift+scaling_shift*4-32], i.e.
; scaling_shift 8..11 selects 7..4 (= 15 - scaling_shift), and as
; [scale_shift+is_12bpc*8+4], which selects 6 (10 bpc) or 4 (12 bpc).
; presumably scaling_shift is always within 8..11 - confirm against FGData
34*c0909341SAndroid Build Coastguard Workerscale_shift:           dw   7,   7,   6,   6,   5,   5,   4,   4
; Overlap blend weights {27,17,17,27}; the first row is the same set
; multiplied by 4.  Row selected by is_12bpc*8.
35*c0909341SAndroid Build Coastguard Workerpw_27_17_17_27:        dw 108,  68,  68, 108,  27,  17,  17,  27
; Weights {23,22,0,32}; the first row is the same set multiplied by 4.
; Row selected by is_12bpc*8 in the fguv functions.
36*c0909341SAndroid Build Coastguard Workerpw_23_22:              dw  92,  88,   0, 128,  23,  22,   0,  32
; Lower clamp, dword index = clip_to_restricted_range << is_12bpc:
;   0 -> 0 (no restriction), 1 -> 64 (10 bpc), 2 -> 256 (12 bpc)
37*c0909341SAndroid Build Coastguard Workerfg_min:        times 2 dw 0
38*c0909341SAndroid Build Coastguard Worker               times 2 dw 64
39*c0909341SAndroid Build Coastguard Worker               times 2 dw 256
; Upper clamp, stored as three {10 bpc, 12 bpc} pairs.  fgy reads dword index
; is_12bpc + clip*4 (pairs 0 and 2); fguv reads
; is_12bpc + (clip << is_id)*2 (pairs 0, 1 and 2).
40*c0909341SAndroid Build Coastguard Workerfg_max:        times 2 dw 1023
41*c0909341SAndroid Build Coastguard Worker               times 2 dw 4095
42*c0909341SAndroid Build Coastguard Worker               times 2 dw 960
43*c0909341SAndroid Build Coastguard Worker               times 2 dw 3840
44*c0909341SAndroid Build Coastguard Worker               times 2 dw 940
45*c0909341SAndroid Build Coastguard Worker               times 2 dw 3760
; Dword rounding term used as the initial accumulator of the vpdpwssd blends,
; indexed by is_12bpc.
46*c0909341SAndroid Build Coastguard Workerscale_rnd:             dd 64
47*c0909341SAndroid Build Coastguard Worker                       dd 16
; Multiplier applied (pmaddwd) to FGData.uv_offset in the fguv functions,
; indexed by is_12bpc.
48*c0909341SAndroid Build Coastguard Workeruv_offset_mul:         dd 256
49*c0909341SAndroid Build Coastguard Worker                       dd 1024
; pshufb control used by the fguv functions to build { uv_luma_mult, uv_mult }.
50*c0909341SAndroid Build Coastguard Workerpb_8_9_0_1:            db 8, 9, 0, 1
51*c0909341SAndroid Build Coastguard Worker
52*c0909341SAndroid Build Coastguard Workercextern pb_0to63
53*c0909341SAndroid Build Coastguard Worker
54*c0909341SAndroid Build Coastguard WorkerSECTION .text
55*c0909341SAndroid Build Coastguard Worker
56*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
;------------------------------------------------------------------------------
; fgy_32x32xn_16bpc
;
; Adds scaled grain from grain_lut to 16-bit pixels read from src and stores
; the clamped result to dst.  The picture row is processed in columns of 32
; pixels (w advances by 32) and two pixel rows per inner iteration (h -= 2).
;
; Arguments loaded into registers by cglobal:
;   dst, src, stride, fg_data, w, scaling
; Arguments read from their stack/home slots below:
;   grain_lut (grain_lutmp), h (hm), sby (sbym), bdmax (r9m)
;
; Code paths, chosen from FGData.overlap_flag and sby != 0:
;   .loop_x / .loop_y                      no blending
;   .loop_x_h_overlap / .loop_y_h_overlap  first 2 pixels of each row blended
;                                          with the previous column's grain
;   .v_overlap                             first 2 rows blended with grain
;                                          picked using the seed of row sby-1
;   .hv_overlap                            both of the above
;
; Values set up once and kept for the whole call:
;   m6  = bdmax (dword broadcast)      m7  = scale_mask
;   m8  = lower clamp (fg_min)         m9  = upper clamp (fg_max)
;   m10 = 15 - scaling_shift           m11 = post-blend shift (6 or 4)
;   m12 = blend weights, row 0         m13 = blend weights, row 1
;   m19 = blend rounding (scale_rnd)   xm20 = {27,17,17,27} weights (qword)
;   k1  = 0xffff   k2 = 0xf   k3 = 0xeeeeeeeeeeeeeeee
;
; The actual per-row work is done by the local subroutines .add_noise and
; .add_noise_v.
;------------------------------------------------------------------------------
57*c0909341SAndroid Build Coastguard Workercglobal fgy_32x32xn_16bpc, 6, 15, 21, dst, src, stride, fg_data, w, scaling, \
58*c0909341SAndroid Build Coastguard Worker                                      grain_lut, offx, sby, see, offy, src_bak
    ; r11 is only a table base pointer during setup; it is reused as src_bak
    ; once all constants have been loaded.
59*c0909341SAndroid Build Coastguard Worker%define base r11-fg_min
60*c0909341SAndroid Build Coastguard Worker    lea             r11, [fg_min]
61*c0909341SAndroid Build Coastguard Worker    mov             r6d, r9m    ; bdmax
62*c0909341SAndroid Build Coastguard Worker    mov             r9d, [fg_dataq+FGData.clip_to_restricted_range]
63*c0909341SAndroid Build Coastguard Worker    mov             r7d, [fg_dataq+FGData.scaling_shift]
64*c0909341SAndroid Build Coastguard Worker    mov            sbyd, sbym
65*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m6, r9m
66*c0909341SAndroid Build Coastguard Worker    shr             r6d, 11     ; is_12bpc
67*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4  m7, [base+scale_mask]
    ; r10 = clip << is_12bpc, the index into fg_min
68*c0909341SAndroid Build Coastguard Worker    shlx           r10d, r9d, r6d
69*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m10, [base+scale_shift+r7*4-32]
    ; r9 = is_12bpc + clip*4, the index into fg_max
70*c0909341SAndroid Build Coastguard Worker    lea             r9d, [r6+r9*4]
71*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m8, [base+fg_min+r10*4]
72*c0909341SAndroid Build Coastguard Worker    kxnorw           k1, k1, k1 ; 0xffff
73*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m9, [base+fg_max+r9*4]
74*c0909341SAndroid Build Coastguard Worker    mov             r12, 0xeeeeeeeeeeeeeeee
75*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m19, [base+scale_rnd+r6*4]
76*c0909341SAndroid Build Coastguard Worker    kshiftrb         k2, k1, 4  ; 0xf
77*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   xm20, [base+pw_27_17_17_27+r6*8]
78*c0909341SAndroid Build Coastguard Worker    kmovq            k3, r12
79*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m11, [base+scale_shift+r6*8+4]
    ; r7b = (sby != 0); ANDed with overlap_flag below to select the
    ; vertical-overlap entry path.
80*c0909341SAndroid Build Coastguard Worker    test           sbyd, sbyd
81*c0909341SAndroid Build Coastguard Worker    setnz           r7b
82*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m12, [base+pw_27_17_17_27+r6*8+0]
83*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m13, [base+pw_27_17_17_27+r6*8+4]
84*c0909341SAndroid Build Coastguard Worker    test            r7b, [fg_dataq+FGData.overlap_flag]
85*c0909341SAndroid Build Coastguard Worker    jnz .v_overlap
86*c0909341SAndroid Build Coastguard Worker
    ; Initial 16-bit seed for this row:
    ;   low byte  = (173*sby + 105) & 0xff
    ;   high byte = ( 37*sby + 178) & 0xff
    ; XORed with FGData.seed.
87*c0909341SAndroid Build Coastguard Worker    imul           seed, sbyd, (173 << 24) | 37
88*c0909341SAndroid Build Coastguard Worker    add            seed, (105 << 24) | 178
89*c0909341SAndroid Build Coastguard Worker    rorx           seed, seed, 24
90*c0909341SAndroid Build Coastguard Worker    movzx          seed, seew
91*c0909341SAndroid Build Coastguard Worker    xor            seed, [fg_dataq+FGData.seed]
    ; src_bak = src + w*2 (end of the row in bytes); w is negated so that it
    ; counts up towards zero.  dst is turned into (dst - src) so that a single
    ; advancing src pointer addresses both buffers.
92*c0909341SAndroid Build Coastguard Worker    lea        src_bakq, [srcq+wq*2]
93*c0909341SAndroid Build Coastguard Worker    neg              wq
94*c0909341SAndroid Build Coastguard Worker    sub            dstq, srcq
95*c0909341SAndroid Build Coastguard Worker
    ; ---- no-overlap path: one iteration per 32-pixel column ----
96*c0909341SAndroid Build Coastguard Worker.loop_x:
    ; 16-bit LFSR step: r6 = seed >> 1.  After OR-ing with 0xeff4 only bits
    ; 0, 1, 3 and 12 can be zero, so the parity flag of (low byte & high byte)
    ; is set when bit0^bit1^bit3^bit12 == 0.  In that case the new top bit is
    ; 0 (r6 is used as is), otherwise bit 15 is set (r6 + 0x8000).
97*c0909341SAndroid Build Coastguard Worker    rorx             r6, seeq, 1
98*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4
99*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
100*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
101*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d                 ; updated seed
    ; offy = (seed >> 8) & 0xf, offx = seed >> 12.
    ; offxy = offy*164 + offx*2 + 747, in units of 16-bit grain entries
    ; (the grain rows are 82 entries apart; 747 = 9*82 + 9).
102*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
103*c0909341SAndroid Build Coastguard Worker    rorx          offxq, seeq, 12
104*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
105*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
106*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx
107*c0909341SAndroid Build Coastguard Worker
108*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
109*c0909341SAndroid Build Coastguard Worker                sby, see, offxy, src_bak
110*c0909341SAndroid Build Coastguard Worker
    ; grain_lut and h are reloaded for every column because the row loop
    ; advances grain_lutq and counts h down.
111*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
112*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
113*c0909341SAndroid Build Coastguard Worker.loop_y:
    ; m4/m5 = 32 grain entries for the two rows handled by this iteration
114*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+offxyq*2+82*0]
115*c0909341SAndroid Build Coastguard Worker    movu             m5, [grain_lutq+offxyq*2+82*2]
116*c0909341SAndroid Build Coastguard Worker    call .add_noise
117*c0909341SAndroid Build Coastguard Worker    sub              hb, 2
118*c0909341SAndroid Build Coastguard Worker    jg .loop_y
    ; next column; done once w has counted up to zero
119*c0909341SAndroid Build Coastguard Worker    add              wq, 32
120*c0909341SAndroid Build Coastguard Worker    jge .end
121*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq*2]
    ; every column after the first uses horizontal overlap when it is enabled
122*c0909341SAndroid Build Coastguard Worker    cmp byte [fg_dataq+FGData.overlap_flag], 0
123*c0909341SAndroid Build Coastguard Worker    je .loop_x
124*c0909341SAndroid Build Coastguard Worker    test           sbyd, sbyd
125*c0909341SAndroid Build Coastguard Worker    jnz .hv_overlap
126*c0909341SAndroid Build Coastguard Worker
127*c0909341SAndroid Build Coastguard Worker    ; horizontal overlap (without vertical overlap)
128*c0909341SAndroid Build Coastguard Worker.loop_x_h_overlap:
    ; same LFSR step as in .loop_x
129*c0909341SAndroid Build Coastguard Worker    rorx             r6, seeq, 1
130*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4
131*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
132*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
133*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d                 ; updated seed
134*c0909341SAndroid Build Coastguard Worker
135*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
136*c0909341SAndroid Build Coastguard Worker                sby, see, offy, src_bak, left_offxy
137*c0909341SAndroid Build Coastguard Worker
    ; offy still holds the previous column's offxy here.  The +73 bias,
    ; together with the -82*1 / +82*1 byte displacements used when loading,
    ; addresses entry 32 of grain rows 0 and 1 of the previous column.
138*c0909341SAndroid Build Coastguard Worker    lea     left_offxyd, [offyq+73]          ; previous column's offy*stride+offx
139*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
140*c0909341SAndroid Build Coastguard Worker    rorx          offxq, seeq, 12
141*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
142*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
143*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx
144*c0909341SAndroid Build Coastguard Worker
145*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
146*c0909341SAndroid Build Coastguard Worker                sby, see, offxy, src_bak, left_offxy
147*c0909341SAndroid Build Coastguard Worker
148*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
149*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
150*c0909341SAndroid Build Coastguard Worker.loop_y_h_overlap:
151*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+offxyq*2+82*0]
152*c0909341SAndroid Build Coastguard Worker    movu             m5, [grain_lutq+offxyq*2+82*2]
    ; xm17 = two left-column entries for row 0 and two for row 1
153*c0909341SAndroid Build Coastguard Worker    movd           xm17, [grain_lutq+left_offxyq*2-82*1]
154*c0909341SAndroid Build Coastguard Worker    pinsrd         xm17, [grain_lutq+left_offxyq*2+82*1], 1
    ; interleave to (left, cur) word pairs and take the dot product with
    ; {27,17,17,27}: pixel 0 = 27*left + 17*cur, pixel 1 = 17*left + 27*cur
    ; (weights are pre-multiplied by 4 in the 10 bpc row of the table)
155*c0909341SAndroid Build Coastguard Worker    punpckldq      xm16, xm4, xm5
156*c0909341SAndroid Build Coastguard Worker    punpcklwd      xm17, xm16
157*c0909341SAndroid Build Coastguard Worker    mova           xm16, xm19
158*c0909341SAndroid Build Coastguard Worker    vpdpwssd       xm16, xm20, xm17
159*c0909341SAndroid Build Coastguard Worker    psrad          xm16, 1
160*c0909341SAndroid Build Coastguard Worker    packssdw       xm16, xm16
161*c0909341SAndroid Build Coastguard Worker    vpsravw        xm16, xm11
    ; k2 = 0xf: replace only the first 4 bytes (2 pixels) of each row.
    ; Row 1's blended pair sits 4 bytes further up, hence the vpalignr.
162*c0909341SAndroid Build Coastguard Worker    vmovdqu8     m4{k2}, m16
163*c0909341SAndroid Build Coastguard Worker    vpalignr     m5{k2}, m16, m16, 4
164*c0909341SAndroid Build Coastguard Worker    call .add_noise
165*c0909341SAndroid Build Coastguard Worker    sub              hb, 2
166*c0909341SAndroid Build Coastguard Worker    jg .loop_y_h_overlap
167*c0909341SAndroid Build Coastguard Worker    add              wq, 32
168*c0909341SAndroid Build Coastguard Worker    jge .end
169*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq*2]
170*c0909341SAndroid Build Coastguard Worker    test           sbyd, sbyd
171*c0909341SAndroid Build Coastguard Worker    jnz .hv_overlap
172*c0909341SAndroid Build Coastguard Worker    jmp .loop_x_h_overlap
173*c0909341SAndroid Build Coastguard Worker
    ; ---- vertical overlap, first column ----
    ; Two 16-bit seeds are packed into one register: the high half uses the
    ; same per-row constants as the non-overlap path (105/178), the low half
    ; uses 188/141, which equal those constants for row sby-1 modulo 256
    ; (105-173 and 178-37).
174*c0909341SAndroid Build Coastguard Worker.v_overlap:
175*c0909341SAndroid Build Coastguard Worker    movzx          sbyd, sbyb
176*c0909341SAndroid Build Coastguard Worker    imul           seed, [fg_dataq+FGData.seed], 0x00010001
177*c0909341SAndroid Build Coastguard Worker    imul            r7d, sbyd, 173 * 0x00010001
178*c0909341SAndroid Build Coastguard Worker    imul           sbyd, 37 * 0x01000100
179*c0909341SAndroid Build Coastguard Worker    add             r7d, (105 << 16) | 188
180*c0909341SAndroid Build Coastguard Worker    add            sbyd, (178 << 24) | (141 << 8)
181*c0909341SAndroid Build Coastguard Worker    and             r7d, 0x00ff00ff
182*c0909341SAndroid Build Coastguard Worker    and            sbyd, 0xff00ff00
183*c0909341SAndroid Build Coastguard Worker    xor            seed, r7d
184*c0909341SAndroid Build Coastguard Worker    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
185*c0909341SAndroid Build Coastguard Worker    lea        src_bakq, [srcq+wq*2]
186*c0909341SAndroid Build Coastguard Worker    neg              wq
187*c0909341SAndroid Build Coastguard Worker    sub            dstq, srcq
188*c0909341SAndroid Build Coastguard Worker
    ; Step both packed LFSRs at once: the two parity results are collected in
    ; bits 0 and 16 of r7d, XORed against the forced-1 low bits of each half,
    ; and the final rotate by 1 moves each feedback bit into bit 15 of its
    ; own half while shifting the remaining bits down.
189*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zero'ed
190*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
191*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
192*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
193*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of top_seed
194*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
195*c0909341SAndroid Build Coastguard Worker    shl             r7d, 16
196*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
197*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of cur_seed
198*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
199*c0909341SAndroid Build Coastguard Worker    xor             r7d, r6d
200*c0909341SAndroid Build Coastguard Worker    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
201*c0909341SAndroid Build Coastguard Worker
202*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
203*c0909341SAndroid Build Coastguard Worker                sby, see, offy, src_bak, _, top_offxy
204*c0909341SAndroid Build Coastguard Worker
    ; Same offset computation as the scalar path, done on both 16-bit halves.
    ; The extra 32*82 lands in the low (top) half only, i.e. the top offset
    ; points 32 grain rows further down.
205*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
206*c0909341SAndroid Build Coastguard Worker    rorx          offxd, seed, 12
207*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
208*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
209*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
210*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
211*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]
212*c0909341SAndroid Build Coastguard Worker
213*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
214*c0909341SAndroid Build Coastguard Worker                sby, see, offxy, src_bak, _, top_offxy
215*c0909341SAndroid Build Coastguard Worker
216*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
217*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
    ; unpack the two offsets: low half -> top_offxy, high half -> offxy
218*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
219*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
220*c0909341SAndroid Build Coastguard Worker
    ; Load rows 0/1 of the current and top grain and interleave them into
    ; (top, cur) word pairs: low halves in m0/m1, high halves in m4/m5, as
    ; expected by .add_noise_v.
221*c0909341SAndroid Build Coastguard Worker    movu            m16, [grain_lutq+offxyq*2+82*0]
222*c0909341SAndroid Build Coastguard Worker    movu             m0, [grain_lutq+top_offxyq*2+82*0]
223*c0909341SAndroid Build Coastguard Worker    movu            m17, [grain_lutq+offxyq*2+82*2]
224*c0909341SAndroid Build Coastguard Worker    movu             m1, [grain_lutq+top_offxyq*2+82*2]
225*c0909341SAndroid Build Coastguard Worker    punpckhwd        m4, m0, m16
226*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m16
227*c0909341SAndroid Build Coastguard Worker    punpckhwd        m5, m1, m17
228*c0909341SAndroid Build Coastguard Worker    punpcklwd        m1, m17
229*c0909341SAndroid Build Coastguard Worker    call .add_noise_v
    ; only the first two rows are blended vertically; the remaining rows of
    ; this column go through the plain row loop
230*c0909341SAndroid Build Coastguard Worker    sub              hb, 2
231*c0909341SAndroid Build Coastguard Worker    jg .loop_y
232*c0909341SAndroid Build Coastguard Worker    add              wq, 32
233*c0909341SAndroid Build Coastguard Worker    jge .end
234*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq*2]
235*c0909341SAndroid Build Coastguard Worker
236*c0909341SAndroid Build Coastguard Worker    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
237*c0909341SAndroid Build Coastguard Worker    ; to .v_overlap, and instead always fall-through to .hv_overlap
238*c0909341SAndroid Build Coastguard Worker.hv_overlap:
    ; packed dual-seed LFSR step, identical to the one in .v_overlap
239*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zero'ed
240*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
241*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
242*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
243*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of top_seed
244*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
245*c0909341SAndroid Build Coastguard Worker    shl             r7d, 16
246*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
247*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of cur_seed
248*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
249*c0909341SAndroid Build Coastguard Worker    xor             r7d, r6d
250*c0909341SAndroid Build Coastguard Worker    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
251*c0909341SAndroid Build Coastguard Worker
252*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
253*c0909341SAndroid Build Coastguard Worker                sby, see, offy, src_bak, left_offxy, top_offxy, topleft_offxy
254*c0909341SAndroid Build Coastguard Worker
    ; remember the previous column's offsets (same +73 bias as in
    ; .loop_x_h_overlap) before computing the new ones
255*c0909341SAndroid Build Coastguard Worker    lea  topleft_offxyd, [top_offxyq+73]
256*c0909341SAndroid Build Coastguard Worker    lea     left_offxyd, [offyq+73]
257*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
258*c0909341SAndroid Build Coastguard Worker    rorx          offxd, seed, 12
259*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
260*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
261*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
262*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
263*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]
264*c0909341SAndroid Build Coastguard Worker
265*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
266*c0909341SAndroid Build Coastguard Worker                sby, see, offxy, src_bak, left_offxy, top_offxy, topleft_offxy
267*c0909341SAndroid Build Coastguard Worker
268*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
269*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
270*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
271*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
272*c0909341SAndroid Build Coastguard Worker
    ; m5/m2 = current grain rows 0/1, m0/m1 = top grain rows 0/1,
    ; xm17/xm18 = {left, topleft} entry pairs for rows 0/1
273*c0909341SAndroid Build Coastguard Worker    movu             m5, [grain_lutq+offxyq*2+82*0]
274*c0909341SAndroid Build Coastguard Worker    movu             m0, [grain_lutq+top_offxyq*2+82*0]
275*c0909341SAndroid Build Coastguard Worker    movd           xm17, [grain_lutq+left_offxyq*2-82*1]
276*c0909341SAndroid Build Coastguard Worker    pinsrd         xm17, [grain_lutq+topleft_offxyq*2-82*1], 1
277*c0909341SAndroid Build Coastguard Worker    movu             m2, [grain_lutq+offxyq*2+82*2]
278*c0909341SAndroid Build Coastguard Worker    movu             m1, [grain_lutq+top_offxyq*2+82*2]
279*c0909341SAndroid Build Coastguard Worker    movd           xm18, [grain_lutq+left_offxyq*2+82*1]
280*c0909341SAndroid Build Coastguard Worker    pinsrd         xm18, [grain_lutq+topleft_offxyq*2+82*1], 1
    ; horizontal blend of the first two entries, done for the current and
    ; the top grain at once (row 0 in xm16, row 1 in xm17)
281*c0909341SAndroid Build Coastguard Worker    punpckldq      xm16, xm5, xm0
282*c0909341SAndroid Build Coastguard Worker    punpcklwd      xm17, xm16
283*c0909341SAndroid Build Coastguard Worker    mova           xm16, xm19
284*c0909341SAndroid Build Coastguard Worker    vpdpwssd       xm16, xm20, xm17
285*c0909341SAndroid Build Coastguard Worker    punpckldq      xm17, xm2, xm1
286*c0909341SAndroid Build Coastguard Worker    punpcklwd      xm18, xm17
287*c0909341SAndroid Build Coastguard Worker    mova           xm17, xm19
288*c0909341SAndroid Build Coastguard Worker    vpdpwssd       xm17, xm20, xm18
    ; (top, cur) interleave for the vertical blend, as in .v_overlap
289*c0909341SAndroid Build Coastguard Worker    punpckhwd        m4, m0, m5
290*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m5
291*c0909341SAndroid Build Coastguard Worker    punpckhwd        m5, m1, m2
292*c0909341SAndroid Build Coastguard Worker    punpcklwd        m1, m2
293*c0909341SAndroid Build Coastguard Worker    psrad          xm16, 1
294*c0909341SAndroid Build Coastguard Worker    psrad          xm17, 1
295*c0909341SAndroid Build Coastguard Worker    packssdw       xm16, xm17
296*c0909341SAndroid Build Coastguard Worker    vpsravw        xm16, xm11
    ; xm16 words = {cur0, cur1, top0, top1} per row; q1302 reorders them to
    ; {top0, cur0, top1, cur1} and k2 (4 words) patches them over the first
    ; two interleaved pairs of m0 (row 0) and m1 (row 1)
297*c0909341SAndroid Build Coastguard Worker    vpshuflw     m0{k2}, m16, q1302
298*c0909341SAndroid Build Coastguard Worker    punpckhqdq     xm16, xm16
299*c0909341SAndroid Build Coastguard Worker    vpshuflw     m1{k2}, m16, q1302
300*c0909341SAndroid Build Coastguard Worker    call .add_noise_v
    ; remaining rows of this column need horizontal overlap only
301*c0909341SAndroid Build Coastguard Worker    sub              hb, 2
302*c0909341SAndroid Build Coastguard Worker    jg .loop_y_h_overlap
    ; lea leaves the flags untouched, so jl still tests the result of the add
303*c0909341SAndroid Build Coastguard Worker    add              wq, 32
304*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq*2]
305*c0909341SAndroid Build Coastguard Worker    jl .hv_overlap
306*c0909341SAndroid Build Coastguard Worker.end:
307*c0909341SAndroid Build Coastguard Worker    RET
;------------------------------------------------------------------------------
; Local subroutines of fgy_32x32xn_16bpc (entered with `call`, left with `ret`).
;
; .add_noise_v
;   In:  m0/m1 = low-half, m4/m5 = high-half (top, cur) interleaved grain
;        words for rows 0/1; m12/m13 = blend weights for rows 0/1;
;        m19 = rounding; m11 = post-blend shift.
;   Out: m4/m5 = vertically blended grain for rows 0/1, then falls through
;        into .add_noise.
;
; .add_noise
;   In:  m4/m5 = grain for two rows; srcq = current row; dstq = dst - src;
;        m6 = bdmax, m7 = scale_mask, m8/m9 = clamp bounds, m10 = scaling
;        shift, k1 = 0xffff, k3 = 0xeeee... byte mask.
;   Out: two rows stored at [dstq+srcq]; srcq advanced by 2*stride;
;        grain_lutq advanced by two grain rows (82*4 bytes).
;   Clobbers: m0-m5, m16, m17, k4, flags.
;------------------------------------------------------------------------------
308*c0909341SAndroid Build Coastguard WorkerALIGN function_align
309*c0909341SAndroid Build Coastguard Worker.add_noise_v:
    ; each accumulator starts at the rounding constant; vpdpwssd adds
    ; top*w0 + cur*w1 for every (top, cur) word pair
310*c0909341SAndroid Build Coastguard Worker    mova             m2, m19
311*c0909341SAndroid Build Coastguard Worker    vpdpwssd         m2, m12, m4
312*c0909341SAndroid Build Coastguard Worker    mova             m3, m19
313*c0909341SAndroid Build Coastguard Worker    vpdpwssd         m3, m13, m5
314*c0909341SAndroid Build Coastguard Worker    mova             m4, m19
315*c0909341SAndroid Build Coastguard Worker    vpdpwssd         m4, m12, m0
316*c0909341SAndroid Build Coastguard Worker    mova             m5, m19
317*c0909341SAndroid Build Coastguard Worker    vpdpwssd         m5, m13, m1
318*c0909341SAndroid Build Coastguard Worker    REPX   {psrad x, 1}, m2, m3, m4, m5
    ; pack low/high halves back into one row of words (signed saturation),
    ; then apply the per-bit-depth arithmetic right shift
319*c0909341SAndroid Build Coastguard Worker    packssdw         m4, m2
320*c0909341SAndroid Build Coastguard Worker    packssdw         m5, m3
321*c0909341SAndroid Build Coastguard Worker    vpsravw          m4, m11
322*c0909341SAndroid Build Coastguard Worker    vpsravw          m5, m11
323*c0909341SAndroid Build Coastguard Worker.add_noise:
    ; m0/m1 = two source rows of 32 pixels each (aligned loads)
324*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq+strideq*0]
325*c0909341SAndroid Build Coastguard Worker    mova             m1, [srcq+strideq*1]
    ; Gathers consume their mask register, so k4 is reloaded before each one.
    ; Even pixels: low word of each dword, masked with bdmax, used as a byte
    ; offset into scaling.
326*c0909341SAndroid Build Coastguard Worker    kmovw            k4, k1
327*c0909341SAndroid Build Coastguard Worker    pand            m16, m6, m0
    ; Odd pixels: high word of each dword; only lanes within bdmax are
    ; gathered.
328*c0909341SAndroid Build Coastguard Worker    psrld            m3, m0, 16
329*c0909341SAndroid Build Coastguard Worker    vpgatherdd   m2{k4}, [scalingq+m16]
330*c0909341SAndroid Build Coastguard Worker    vpcmpud          k4, m3, m6, 2 ; px <= bdmax
331*c0909341SAndroid Build Coastguard Worker    vpgatherdd  m16{k4}, [scalingq+m3]
332*c0909341SAndroid Build Coastguard Worker    kmovw            k4, k1
333*c0909341SAndroid Build Coastguard Worker    pand            m17, m6, m1
334*c0909341SAndroid Build Coastguard Worker    vpgatherdd   m3{k4}, [scalingq+m17]
    ; Merge: byte 0 of each dword keeps the even pixel's scaling byte; bytes
    ; 1-3 are rewritten so that byte 2 holds the odd pixel's scaling byte and
    ; bytes 1 and 3 are zero, giving one zero-extended scaling value per word.
335*c0909341SAndroid Build Coastguard Worker    vpshufb      m2{k3}, m16, m7
336*c0909341SAndroid Build Coastguard Worker    psrld           m16, m1, 16
337*c0909341SAndroid Build Coastguard Worker    vpcmpud          k4, m16, m6, 2
338*c0909341SAndroid Build Coastguard Worker    vpgatherdd  m17{k4}, [scalingq+m16]
339*c0909341SAndroid Build Coastguard Worker    vpshufb      m3{k3}, m17, m7
    ; scaling << (15 - scaling_shift), so that pmulhrsw yields
    ; (grain * scaling + rounding) >> scaling_shift
340*c0909341SAndroid Build Coastguard Worker    vpsllvw          m2, m10
341*c0909341SAndroid Build Coastguard Worker    vpsllvw          m3, m10
342*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m2
343*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, m3
    ; step grain_lut down two rows for the caller's next iteration
344*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82*4
    ; add the noise and clamp to [m8, m9]
345*c0909341SAndroid Build Coastguard Worker    paddw            m0, m4
346*c0909341SAndroid Build Coastguard Worker    paddw            m1, m5
347*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m8
348*c0909341SAndroid Build Coastguard Worker    pmaxsw           m1, m8
349*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m9
350*c0909341SAndroid Build Coastguard Worker    pminsw           m1, m9
    ; dstq holds (dst - src), so dstq+srcq is the matching destination row
351*c0909341SAndroid Build Coastguard Worker    mova    [dstq+srcq], m0
352*c0909341SAndroid Build Coastguard Worker    add            srcq, strideq
353*c0909341SAndroid Build Coastguard Worker    mova    [dstq+srcq], m1
354*c0909341SAndroid Build Coastguard Worker    add            srcq, strideq
355*c0909341SAndroid Build Coastguard Worker    ret
356*c0909341SAndroid Build Coastguard Worker
357*c0909341SAndroid Build Coastguard Worker%macro FGUV_FN 3 ; name, ss_hor, ss_ver
358*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling, \
359*c0909341SAndroid Build Coastguard Worker                                           grain_lut, h, sby, luma, lstride, uv_pl, is_id
360*c0909341SAndroid Build Coastguard Worker%define base r12-fg_min
361*c0909341SAndroid Build Coastguard Worker    lea             r12, [fg_min]
362*c0909341SAndroid Build Coastguard Worker    mov             r9d, r13m            ; bdmax
363*c0909341SAndroid Build Coastguard Worker    mov             r7d, [fg_dataq+FGData.scaling_shift]
364*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
365*c0909341SAndroid Build Coastguard Worker    mov            r11d, is_idm
366*c0909341SAndroid Build Coastguard Worker    kxnorw           k1, k1, k1          ; 0xffff
367*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m5, r13m
368*c0909341SAndroid Build Coastguard Worker    mov             r13, 0xeeeeeeeeeeeeeeee
369*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4  m6, [base+scale_mask]
370*c0909341SAndroid Build Coastguard Worker    shr             r9d, 11              ; is_12bpc
371*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m7, [base+scale_shift+r7*4-32]
372*c0909341SAndroid Build Coastguard Worker    shlx           r10d, r6d, r9d
373*c0909341SAndroid Build Coastguard Worker    mov            sbyd, sbym
374*c0909341SAndroid Build Coastguard Worker    shlx            r6d, r6d, r11d
375*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m8, [base+fg_min+r10*4]
376*c0909341SAndroid Build Coastguard Worker    lea             r6d, [r9+r6*2]
377*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m9, [base+fg_max+r6*4]
378*c0909341SAndroid Build Coastguard Worker    kmovq            k2, r13
379*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m20, [base+scale_rnd+r9*4]
380*c0909341SAndroid Build Coastguard Worker    packssdw         m4, m5, m5
381*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m21, [base+scale_shift+r9*8+4]
382*c0909341SAndroid Build Coastguard Worker%if %2
383*c0909341SAndroid Build Coastguard Worker    mova            m12, [pb_0to63] ; pw_even
384*c0909341SAndroid Build Coastguard Worker    mov            r13d, 0x0101
385*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m10, [base+pw_23_22+r9*8]
386*c0909341SAndroid Build Coastguard Worker    kmovw            k3, r13d
387*c0909341SAndroid Build Coastguard Worker%if %3
388*c0909341SAndroid Build Coastguard Worker    pshufd          m11, m10, q0000
389*c0909341SAndroid Build Coastguard Worker%else
390*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   ym16, [base+pw_27_17_17_27+r9*8+0]
391*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m11, [base+pw_27_17_17_27+r9*8+4]
392*c0909341SAndroid Build Coastguard Worker    vmovdqu16   m11{k1}, m16
393*c0909341SAndroid Build Coastguard Worker%endif
394*c0909341SAndroid Build Coastguard Worker    psrlw           m13, m12, 8          ; pw_odd
395*c0909341SAndroid Build Coastguard Worker%else
396*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m10, [base+pw_27_17_17_27+r9*8]
397*c0909341SAndroid Build Coastguard Worker    kshiftrb         k3, k1, 7           ; 0x01
398*c0909341SAndroid Build Coastguard Worker    kshiftrb         k4, k1, 4           ; 0x0f
399*c0909341SAndroid Build Coastguard Worker    pshufd          m11, m10, q0000
400*c0909341SAndroid Build Coastguard Worker%endif
401*c0909341SAndroid Build Coastguard Worker    mov        lstrideq, r10mp
402*c0909341SAndroid Build Coastguard Worker    test           sbyd, sbyd
403*c0909341SAndroid Build Coastguard Worker    setnz           r7b
404*c0909341SAndroid Build Coastguard Worker    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
405*c0909341SAndroid Build Coastguard Worker    jne .csfl
406*c0909341SAndroid Build Coastguard Worker
407*c0909341SAndroid Build Coastguard Worker%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    ; Macro prologue: optional uv_offset/uv_mult setup (only when %1 is set),
    ; dispatch to the vertical-overlap path, then seed and pointer setup for
    ; the path without vertical overlap.
408*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
409*c0909341SAndroid Build Coastguard Worker                _, sby, see, lstride
410*c0909341SAndroid Build Coastguard Worker
411*c0909341SAndroid Build Coastguard Worker%if %1
412*c0909341SAndroid Build Coastguard Worker    mov             r6d, r11m            ; NOTE(review): presumably the chroma plane index - confirm against the cglobal argument list
413*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m0, [base+uv_offset_mul+r9*4]
414*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m1, [base+pb_8_9_0_1]
415*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m14, [fg_dataq+FGData.uv_offset+r6*4]
416*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m15, [fg_dataq+FGData.uv_mult+r6*4]
417*c0909341SAndroid Build Coastguard Worker    pmaddwd         m14, m0              ; per-dword pmaddwd of uv_offset with the uv_offset_mul[r9] word pair
418*c0909341SAndroid Build Coastguard Worker    pshufb          m15, m1 ; { uv_luma_mult, uv_mult }
419*c0909341SAndroid Build Coastguard Worker%endif
420*c0909341SAndroid Build Coastguard Worker    test            r7b, [fg_dataq+FGData.overlap_flag]  ; r7b = (sby != 0), from the setnz in the function prologue
421*c0909341SAndroid Build Coastguard Worker    jnz %%v_overlap                      ; taken only when sby != 0 and bit 0 of overlap_flag is set
422*c0909341SAndroid Build Coastguard Worker
    ; Per-row seed: one imul/add builds the sby-dependent bytes (multipliers
    ; 37/173, offsets 178/105), the rotate brings them into the low 16 bits,
    ; and the result is XORed with fg_data->seed.
423*c0909341SAndroid Build Coastguard Worker    imul           seed, sbyd, (173 << 24) | 37
424*c0909341SAndroid Build Coastguard Worker    add            seed, (105 << 24) | 178
425*c0909341SAndroid Build Coastguard Worker    rorx           seed, seed, 24        ; i.e. rotate left by 8: byte 3 -> bits 0-7, byte 0 -> bits 8-15
426*c0909341SAndroid Build Coastguard Worker    movzx          seed, seew            ; keep only those 16 bits
427*c0909341SAndroid Build Coastguard Worker    xor            seed, [fg_dataq+FGData.seed]
428*c0909341SAndroid Build Coastguard Worker
429*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
430*c0909341SAndroid Build Coastguard Worker                offx, offy, see, lstride, luma
431*c0909341SAndroid Build Coastguard Worker
    ; Compute the end-of-row pointers (base + w scaled to bytes) and store them
    ; in the r9/r10/r11 argument slots, overwriting the values previously read
    ; from those slots. w is then negated so it can count up towards zero.
432*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
433*c0909341SAndroid Build Coastguard Worker    lea             r12, [srcq+wq*2]
434*c0909341SAndroid Build Coastguard Worker    lea             r13, [dstq+wq*2]
435*c0909341SAndroid Build Coastguard Worker    lea             r14, [lumaq+wq*(2<<%2)]  ; luma advances twice as fast when %2 (ss_hor) is set
436*c0909341SAndroid Build Coastguard Worker    mov            r9mp, r12
437*c0909341SAndroid Build Coastguard Worker    mov           r10mp, r13
438*c0909341SAndroid Build Coastguard Worker    mov           r11mp, r14
439*c0909341SAndroid Build Coastguard Worker    neg              wq                  ; w = -w; the column loops add 32>>%2 until it is >= 0
440*c0909341SAndroid Build Coastguard Worker
441*c0909341SAndroid Build Coastguard Worker%%loop_x:
    ; Column loop without any overlap blending: advance the 16-bit seed,
    ; derive the grain LUT offset for this column, then add noise to all rows.
442*c0909341SAndroid Build Coastguard Worker    rorx             r6, seeq, 1           ; r6d = seed >> 1
443*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4            ; force every low-16 bit except 0, 1, 3 and 12 to one
444*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh              ; forced ones cancel out: PF = even parity of seed bits 0, 1, 3, 12
445*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]       ; candidate with the new top bit set (lea leaves the flags intact)
446*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d               ; updated seed
447*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
448*c0909341SAndroid Build Coastguard Worker    rorx          offxq, seeq, 12          ; low bits = seed >> 12; rotated-in high bits drop out of the 32-bit lea below
449*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf               ; offy = (seed >> 8) & 0xf
450*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
451*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
452*c0909341SAndroid Build Coastguard Worker
    ; From here offy's register holds the combined offxy and offx's register
    ; is reused as the row counter h.
453*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
454*c0909341SAndroid Build Coastguard Worker                h, offxy, see, lstride, luma
455*c0909341SAndroid Build Coastguard Worker
456*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp       ; reload the LUT base for every column
457*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
458*c0909341SAndroid Build Coastguard Worker%%loop_y:
    ; Load the grain for one iteration: four 32-byte rows when %2 is set,
    ; otherwise two 64-byte rows. Consecutive rows are 82*2 bytes apart.
459*c0909341SAndroid Build Coastguard Worker%if %2
460*c0909341SAndroid Build Coastguard Worker    movu           ym18, [grain_lutq+offxyq*2+82*0]
461*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
462*c0909341SAndroid Build Coastguard Worker    movu           ym19, [grain_lutq+offxyq*2+82*4]
463*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
464*c0909341SAndroid Build Coastguard Worker%else
465*c0909341SAndroid Build Coastguard Worker    movu            m18, [grain_lutq+offxyq*2+82*0]
466*c0909341SAndroid Build Coastguard Worker    movu            m19, [grain_lutq+offxyq*2+82*2]
467*c0909341SAndroid Build Coastguard Worker%endif
468*c0909341SAndroid Build Coastguard Worker    call %%add_noise
469*c0909341SAndroid Build Coastguard Worker    sub              hb, 2<<%2             ; one iteration covers 2<<%2 rows
470*c0909341SAndroid Build Coastguard Worker    jg %%loop_y
471*c0909341SAndroid Build Coastguard Worker    add              wq, 32>>%2            ; next column; w is negative while columns remain
472*c0909341SAndroid Build Coastguard Worker    jge .end
473*c0909341SAndroid Build Coastguard Worker    mov            srcq, r9mp              ; rebuild the pointers from the saved end-of-row values
474*c0909341SAndroid Build Coastguard Worker    mov            dstq, r10mp
475*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r11mp
476*c0909341SAndroid Build Coastguard Worker    lea            srcq, [srcq+wq*2]
477*c0909341SAndroid Build Coastguard Worker    lea            dstq, [dstq+wq*2]
478*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+wq*(2<<%2)]
479*c0909341SAndroid Build Coastguard Worker    cmp byte [fg_dataq+FGData.overlap_flag], 0
480*c0909341SAndroid Build Coastguard Worker    je %%loop_x                            ; overlap disabled: keep using the plain column loop
481*c0909341SAndroid Build Coastguard Worker    cmp       dword r8m, 0 ; sby
482*c0909341SAndroid Build Coastguard Worker    jne %%hv_overlap                       ; sby != 0: later columns need both overlaps; otherwise fall through to horizontal only
483*c0909341SAndroid Build Coastguard Worker
484*c0909341SAndroid Build Coastguard Worker    ; horizontal overlap (without vertical overlap)
485*c0909341SAndroid Build Coastguard Worker%%loop_x_h_overlap:
    ; Column loop with horizontal overlap only: the leading grain values of
    ; each row are blended with grain taken from the previous column's block.
486*c0909341SAndroid Build Coastguard Worker    rorx             r6, seeq, 1           ; r6d = seed >> 1
487*c0909341SAndroid Build Coastguard Worker    or             seed, 0xEFF4            ; force every low-16 bit except 0, 1, 3 and 12 to one
488*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh              ; PF = even parity of seed bits 0, 1, 3, 12
489*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
490*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d               ; updated seed
491*c0909341SAndroid Build Coastguard Worker
492*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
493*c0909341SAndroid Build Coastguard Worker                offx, offy, see, lstride, luma, left_offxy
494*c0909341SAndroid Build Coastguard Worker
    ; offy's register still holds the previous column's offxy at this point,
    ; so it must be read before being overwritten below.
495*c0909341SAndroid Build Coastguard Worker    lea     left_offxyd, [offyq+(32>>%2)]  ; previous column's offy*stride+offx
496*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
497*c0909341SAndroid Build Coastguard Worker    rorx          offxq, seeq, 12
498*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
499*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
500*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
501*c0909341SAndroid Build Coastguard Worker
502*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
503*c0909341SAndroid Build Coastguard Worker                h, offxy, see, lstride, luma, left_offxy
504*c0909341SAndroid Build Coastguard Worker
505*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
506*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
507*c0909341SAndroid Build Coastguard Worker%%loop_y_h_overlap:
508*c0909341SAndroid Build Coastguard Worker%if %2
509*c0909341SAndroid Build Coastguard Worker    movu           ym18, [grain_lutq+offxyq*2+82*0]       ; current block, rows 0-1 in m18
510*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
511*c0909341SAndroid Build Coastguard Worker    movu           ym19, [grain_lutq+offxyq*2+82*4]       ; current block, rows 2-3 in m19
512*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
513*c0909341SAndroid Build Coastguard Worker    movd           xm16, [grain_lutq+left_offxyq*2+82*0]  ; left block, rows 0/1 in 128-bit lanes 0/2
514*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m16, [grain_lutq+left_offxyq*2+82*2], 2
515*c0909341SAndroid Build Coastguard Worker    movd           xm17, [grain_lutq+left_offxyq*2+82*4]  ; left block, rows 2/3 in 128-bit lanes 0/2
516*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m17, [grain_lutq+left_offxyq*2+82*6], 2
517*c0909341SAndroid Build Coastguard Worker    punpckldq       m16, m17               ; gather the left-block dwords of the four rows
518*c0909341SAndroid Build Coastguard Worker    punpckldq       m17, m18, m19          ; gather the matching current-block dwords
519*c0909341SAndroid Build Coastguard Worker    punpcklwd       m16, m17               ; (left, cur) word pairs
520*c0909341SAndroid Build Coastguard Worker    mova            m17, m20               ; start the accumulator from m20 (scale_rnd entry)
521*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m17, m16, m10          ; += left*w0 + cur*w1 with the m10 weights
522*c0909341SAndroid Build Coastguard Worker    psrad           m17, 1
523*c0909341SAndroid Build Coastguard Worker    packssdw        m17, m17
524*c0909341SAndroid Build Coastguard Worker    vpsravw         m17, m21               ; final per-word arithmetic shift by m21 (scale_shift entry)
525*c0909341SAndroid Build Coastguard Worker%else
526*c0909341SAndroid Build Coastguard Worker    movu            m18, [grain_lutq+offxyq*2+82*0]       ; current block, row 0
527*c0909341SAndroid Build Coastguard Worker    movu            m19, [grain_lutq+offxyq*2+82*2]       ; current block, row 1
528*c0909341SAndroid Build Coastguard Worker    movd           xm16, [grain_lutq+left_offxyq*2+82*0]  ; left block, first dword of rows 0 and 1
529*c0909341SAndroid Build Coastguard Worker    pinsrd         xm16, [grain_lutq+left_offxyq*2+82*2], 1
530*c0909341SAndroid Build Coastguard Worker    punpckldq      xm17, xm18, xm19
531*c0909341SAndroid Build Coastguard Worker    punpcklwd      xm16, xm17              ; (left, cur) word pairs
532*c0909341SAndroid Build Coastguard Worker    mova           xm17, xm20
533*c0909341SAndroid Build Coastguard Worker    vpdpwssd       xm17, xm16, xm10
534*c0909341SAndroid Build Coastguard Worker    psrad          xm17, 1
535*c0909341SAndroid Build Coastguard Worker    packssdw       xm17, xm17
536*c0909341SAndroid Build Coastguard Worker    vpsravw        xm17, xm21
537*c0909341SAndroid Build Coastguard Worker%endif
538*c0909341SAndroid Build Coastguard Worker    vmovdqa32   m18{k3}, m17               ; patch the blended values into the dwords selected by k3
539*c0909341SAndroid Build Coastguard Worker    vpshufd     m19{k3}, m17, q0321        ; same for m19, using the next dword of the blend result
540*c0909341SAndroid Build Coastguard Worker    call %%add_noise
541*c0909341SAndroid Build Coastguard Worker    sub              hb, 2<<%2
542*c0909341SAndroid Build Coastguard Worker    jg %%loop_y_h_overlap
543*c0909341SAndroid Build Coastguard Worker    add              wq, 32>>%2
544*c0909341SAndroid Build Coastguard Worker    jge .end
545*c0909341SAndroid Build Coastguard Worker    mov            srcq, r9mp              ; rebuild the pointers from the saved end-of-row values
546*c0909341SAndroid Build Coastguard Worker    mov            dstq, r10mp
547*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r11mp
548*c0909341SAndroid Build Coastguard Worker    lea            srcq, [srcq+wq*2]
549*c0909341SAndroid Build Coastguard Worker    lea            dstq, [dstq+wq*2]
550*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+wq*(2<<%2)]
551*c0909341SAndroid Build Coastguard Worker    cmp       dword r8m, 0 ; sby
552*c0909341SAndroid Build Coastguard Worker    jne %%hv_overlap
553*c0909341SAndroid Build Coastguard Worker    jmp %%loop_x_h_overlap
554*c0909341SAndroid Build Coastguard Worker
555*c0909341SAndroid Build Coastguard Worker%%v_overlap:
    ; First column when vertical overlap is active: derive the seeds of the
    ; current and the top row in one register, load both blocks' grain and
    ; blend the first rows vertically via %%add_noise_v.
556*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
557*c0909341SAndroid Build Coastguard Worker                _, sby, see, lstride
558*c0909341SAndroid Build Coastguard Worker
    ; The high half uses the same constants as the non-overlap seed (105/178).
    ; The low half uses 188 = 105-173 and 141 = 178-37 (mod 256), i.e. the
    ; same formula evaluated for sby-1.
559*c0909341SAndroid Build Coastguard Worker    movzx          sbyd, sbyb
560*c0909341SAndroid Build Coastguard Worker    imul           seed, [fg_dataq+FGData.seed], 0x00010001  ; replicate the seed into both 16-bit halves
561*c0909341SAndroid Build Coastguard Worker    imul            r7d, sbyd, 173 * 0x00010001
562*c0909341SAndroid Build Coastguard Worker    imul           sbyd, 37 * 0x01000100
563*c0909341SAndroid Build Coastguard Worker    add             r7d, (105 << 16) | 188
564*c0909341SAndroid Build Coastguard Worker    add            sbyd, (178 << 24) | (141 << 8)
565*c0909341SAndroid Build Coastguard Worker    and             r7d, 0x00ff00ff
566*c0909341SAndroid Build Coastguard Worker    and            sbyd, 0xff00ff00
567*c0909341SAndroid Build Coastguard Worker    xor            seed, r7d
568*c0909341SAndroid Build Coastguard Worker    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
569*c0909341SAndroid Build Coastguard Worker
570*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
571*c0909341SAndroid Build Coastguard Worker                offx, offy, see, lstride, luma, _, top_offxy
572*c0909341SAndroid Build Coastguard Worker
    ; Store the end-of-row pointers in the r9/r10/r11 argument slots and
    ; negate w, as in the path without vertical overlap.
573*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
574*c0909341SAndroid Build Coastguard Worker    lea             r12, [srcq+wq*2]
575*c0909341SAndroid Build Coastguard Worker    lea             r13, [dstq+wq*2]
576*c0909341SAndroid Build Coastguard Worker    lea             r14, [lumaq+wq*(2<<%2)]
577*c0909341SAndroid Build Coastguard Worker    mov            r9mp, r12
578*c0909341SAndroid Build Coastguard Worker    mov           r10mp, r13
579*c0909341SAndroid Build Coastguard Worker    mov           r11mp, r14
580*c0909341SAndroid Build Coastguard Worker    neg              wq
581*c0909341SAndroid Build Coastguard Worker
    ; Advance both packed seeds at once: each setp captures the feedback
    ; parity of one half, and the final rotate shifts both halves right by one.
582*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zero'ed
583*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
584*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
585*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
586*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of top_seed
587*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
588*c0909341SAndroid Build Coastguard Worker    shl             r7d, 16
589*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
590*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of cur_seed
591*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
592*c0909341SAndroid Build Coastguard Worker    xor             r7d, r6d
593*c0909341SAndroid Build Coastguard Worker    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
594*c0909341SAndroid Build Coastguard Worker
595*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
596*c0909341SAndroid Build Coastguard Worker    rorx          offxd, seed, 12
597*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f            ; 4-bit offy for both halves
598*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f            ; 4-bit offx for both halves
599*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
600*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    ; the trailing (32>>%3)*82 term is added once, so it only moves the low
    ; half (top_offxy) a further 32>>%3 rows down the LUT
601*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
602*c0909341SAndroid Build Coastguard Worker
603*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
604*c0909341SAndroid Build Coastguard Worker                h, offxy, see, lstride, luma, _, top_offxy
605*c0909341SAndroid Build Coastguard Worker
606*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
607*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
608*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw             ; split the packed offsets: low half = top block
609*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16                 ; high half = current block
610*c0909341SAndroid Build Coastguard Worker
    ; Load current and top grain and interleave them into word pairs that
    ; %%add_noise_v blends with the m11 weights.
611*c0909341SAndroid Build Coastguard Worker%if %3
612*c0909341SAndroid Build Coastguard Worker    movu           ym16, [grain_lutq+offxyq*2+82*0]       ; current row 0
613*c0909341SAndroid Build Coastguard Worker    movu            ym1, [grain_lutq+top_offxyq*2+82*0]   ; top row 0
614*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8 m18, [grain_lutq+offxyq*2+82*2]       ; current row 1 in both 256-bit halves
615*c0909341SAndroid Build Coastguard Worker    movu           ym19, [grain_lutq+offxyq*2+82*4]
616*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
617*c0909341SAndroid Build Coastguard Worker    punpcklwd      ym17, ym1, ym16         ; (top, cur) pairs for row 0 only
618*c0909341SAndroid Build Coastguard Worker    punpckhwd       ym1, ym16
619*c0909341SAndroid Build Coastguard Worker%elif %2
620*c0909341SAndroid Build Coastguard Worker    movu           ym18, [grain_lutq+offxyq*2+82*0]
621*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
622*c0909341SAndroid Build Coastguard Worker    movu           ym17, [grain_lutq+top_offxyq*2+82*0]
623*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m17, [grain_lutq+top_offxyq*2+82*2], 1
624*c0909341SAndroid Build Coastguard Worker    movu           ym19, [grain_lutq+offxyq*2+82*4]
625*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
626*c0909341SAndroid Build Coastguard Worker    punpcklwd       m16, m17, m18          ; (top, cur) pairs for rows 0 and 1
627*c0909341SAndroid Build Coastguard Worker    punpckhwd       m17, m18
628*c0909341SAndroid Build Coastguard Worker%else
629*c0909341SAndroid Build Coastguard Worker    movu            m18, [grain_lutq+offxyq*2+82*0]
630*c0909341SAndroid Build Coastguard Worker    movu            m19, [grain_lutq+top_offxyq*2+82*0]
631*c0909341SAndroid Build Coastguard Worker    movu             m2, [grain_lutq+offxyq*2+82*2]
632*c0909341SAndroid Build Coastguard Worker    movu            m16, [grain_lutq+top_offxyq*2+82*2]
633*c0909341SAndroid Build Coastguard Worker    punpckhwd        m1, m19, m18          ; row 0: (top, cur) order
634*c0909341SAndroid Build Coastguard Worker    punpcklwd       m19, m18
635*c0909341SAndroid Build Coastguard Worker    punpckhwd       m18, m2, m16           ; row 1: (cur, top) order, so the same m11 weights apply swapped
636*c0909341SAndroid Build Coastguard Worker    punpcklwd        m2, m16
637*c0909341SAndroid Build Coastguard Worker%endif
638*c0909341SAndroid Build Coastguard Worker    call %%add_noise_v
639*c0909341SAndroid Build Coastguard Worker    sub              hb, 2<<%2
640*c0909341SAndroid Build Coastguard Worker    jg %%loop_y                            ; remaining rows of this column use the plain row loop
641*c0909341SAndroid Build Coastguard Worker    add              wq, 32>>%2
642*c0909341SAndroid Build Coastguard Worker    jge .end
643*c0909341SAndroid Build Coastguard Worker    mov            srcq, r9mp
644*c0909341SAndroid Build Coastguard Worker    mov            dstq, r10mp
645*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r11mp
646*c0909341SAndroid Build Coastguard Worker    lea            srcq, [srcq+wq*2]
647*c0909341SAndroid Build Coastguard Worker    lea            dstq, [dstq+wq*2]
648*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+wq*(2<<%2)]
649*c0909341SAndroid Build Coastguard Worker
650*c0909341SAndroid Build Coastguard Worker    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
651*c0909341SAndroid Build Coastguard Worker    ; to %%v_overlap, and instead always fall-through to %%hv_overlap
652*c0909341SAndroid Build Coastguard Worker%%hv_overlap:
    ; Column loop with both overlaps: the leading values of each row are
    ; blended horizontally with the previous column's block (for the current
    ; and the top block alike), then the first rows are blended vertically.
653*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zero'ed
654*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
655*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
656*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
657*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of top_seed
658*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
659*c0909341SAndroid Build Coastguard Worker    shl             r7d, 16
660*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
661*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of cur_seed
662*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
663*c0909341SAndroid Build Coastguard Worker    xor             r7d, r6d
664*c0909341SAndroid Build Coastguard Worker    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
665*c0909341SAndroid Build Coastguard Worker
666*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
667*c0909341SAndroid Build Coastguard Worker                offx, offy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy
668*c0909341SAndroid Build Coastguard Worker
    ; Both source registers still hold the previous column's offsets here, so
    ; the left/top-left offsets must be taken before offy/top_offxy are redone.
669*c0909341SAndroid Build Coastguard Worker    lea  topleft_offxyq, [top_offxyq+(32>>%2)]  ; previous column's top offset plus one block width
670*c0909341SAndroid Build Coastguard Worker    lea     left_offxyq, [offyq+(32>>%2)]       ; previous column's offxy plus one block width
671*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
672*c0909341SAndroid Build Coastguard Worker    rorx          offxd, seed, 12
673*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
674*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
675*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
676*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
677*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
678*c0909341SAndroid Build Coastguard Worker
679*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
680*c0909341SAndroid Build Coastguard Worker                h, offxy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy
681*c0909341SAndroid Build Coastguard Worker
682*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
683*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
684*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw             ; low half = top block offset
685*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16                 ; high half = current block offset
686*c0909341SAndroid Build Coastguard Worker
687*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
688*c0909341SAndroid Build Coastguard Worker%if %2
689*c0909341SAndroid Build Coastguard Worker    movd           xm16, [grain_lutq+left_offxyq*2+82*0]  ; left block, rows 0/1 in 128-bit lanes 0/2
690*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m16, [grain_lutq+left_offxyq*2+82*2], 2
691*c0909341SAndroid Build Coastguard Worker    movd           xm17, [grain_lutq+left_offxyq*2+82*4]  ; left block, rows 2/3 in 128-bit lanes 0/2
692*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m17, [grain_lutq+left_offxyq*2+82*6], 2
693*c0909341SAndroid Build Coastguard Worker    movu           ym18, [grain_lutq+offxyq*2+82*0]       ; current block, rows 0-1
694*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
695*c0909341SAndroid Build Coastguard Worker    movu           ym19, [grain_lutq+offxyq*2+82*4]       ; current block, rows 2-3
696*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
697*c0909341SAndroid Build Coastguard Worker    punpckldq       m16, m17
698*c0909341SAndroid Build Coastguard Worker    punpckldq       m17, m18, m19
699*c0909341SAndroid Build Coastguard Worker    punpcklwd       m16, m17               ; (left, cur) word pairs
700*c0909341SAndroid Build Coastguard Worker    movu            ym1, [grain_lutq+top_offxyq*2+82*0]      ; top block, row 0
701*c0909341SAndroid Build Coastguard Worker    movd           xm17, [grain_lutq+topleft_offxyq*2+82*0]  ; top-left block, row 0
702*c0909341SAndroid Build Coastguard Worker    mova             m0, m20
703*c0909341SAndroid Build Coastguard Worker    vpdpwssd         m0, m16, m10          ; horizontal blend of the current block
704*c0909341SAndroid Build Coastguard Worker%if %3
705*c0909341SAndroid Build Coastguard Worker    punpcklwd      xm17, xm1               ; (topleft, top) word pairs, row 0 only
706*c0909341SAndroid Build Coastguard Worker    mova           xm16, xm20
707*c0909341SAndroid Build Coastguard Worker    vpdpwssd       xm16, xm17, xm10        ; horizontal blend of the top block
708*c0909341SAndroid Build Coastguard Worker    psrad          xm16, 1
709*c0909341SAndroid Build Coastguard Worker%else
710*c0909341SAndroid Build Coastguard Worker    vinserti32x8     m1, [grain_lutq+top_offxyq*2+82*2], 1      ; top block, row 1
711*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m17, [grain_lutq+topleft_offxyq*2+82*2], 2  ; top-left block, row 1
712*c0909341SAndroid Build Coastguard Worker    punpcklwd       m17, m1                ; (topleft, top) word pairs, rows 0 and 1
713*c0909341SAndroid Build Coastguard Worker    mova            m16, m20
714*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m16, m17, m10          ; horizontal blend of the top block
715*c0909341SAndroid Build Coastguard Worker    psrad           m16, 1
716*c0909341SAndroid Build Coastguard Worker%endif
717*c0909341SAndroid Build Coastguard Worker    psrad            m0, 1
718*c0909341SAndroid Build Coastguard Worker    packssdw         m0, m16               ; per lane: current-block results low, top-block results high
719*c0909341SAndroid Build Coastguard Worker    vpsravw          m0, m21
720*c0909341SAndroid Build Coastguard Worker    vmovdqa32   m18{k3}, m0                ; patch the blended values into the current rows (dwords selected by k3)
721*c0909341SAndroid Build Coastguard Worker    vpshufd     m19{k3}, m0, q0321
722*c0909341SAndroid Build Coastguard Worker%if %3
723*c0909341SAndroid Build Coastguard Worker    vpunpckhdq  ym1{k3}, ym0, ym0          ; patch the blended values into the top row
724*c0909341SAndroid Build Coastguard Worker    punpcklwd      ym17, ym1, ym18         ; (top, cur) pairs for the vertical blend
725*c0909341SAndroid Build Coastguard Worker    punpckhwd       ym1, ym18
726*c0909341SAndroid Build Coastguard Worker%else
727*c0909341SAndroid Build Coastguard Worker    vpunpckhdq   m1{k3}, m0, m0            ; patch the blended values into the top rows
728*c0909341SAndroid Build Coastguard Worker    punpcklwd       m16, m1, m18           ; (top, cur) pairs for the vertical blend
729*c0909341SAndroid Build Coastguard Worker    punpckhwd       m17, m1, m18
730*c0909341SAndroid Build Coastguard Worker%endif
731*c0909341SAndroid Build Coastguard Worker%else
732*c0909341SAndroid Build Coastguard Worker    movu            m18, [grain_lutq+offxyq*2+82*0]            ; current row 0
733*c0909341SAndroid Build Coastguard Worker    movu            m19, [grain_lutq+top_offxyq*2+82*0]        ; top row 0
734*c0909341SAndroid Build Coastguard Worker    movd           xm17, [grain_lutq+left_offxyq*2+82*0]       ; left row 0
735*c0909341SAndroid Build Coastguard Worker    pinsrd         xm17, [grain_lutq+topleft_offxyq*2+82*0], 1 ; top-left row 0
736*c0909341SAndroid Build Coastguard Worker    punpckldq      xm16, xm18, xm19
737*c0909341SAndroid Build Coastguard Worker    punpcklwd      xm17, xm16              ; (left, cur) and (topleft, top) word pairs for row 0
738*c0909341SAndroid Build Coastguard Worker    movu             m2, [grain_lutq+offxyq*2+82*2]            ; current row 1
739*c0909341SAndroid Build Coastguard Worker    movu             m0, [grain_lutq+top_offxyq*2+82*2]        ; top row 1
740*c0909341SAndroid Build Coastguard Worker    movd           xm16, [grain_lutq+left_offxyq*2+82*2]       ; left row 1
741*c0909341SAndroid Build Coastguard Worker    pinsrd         xm16, [grain_lutq+topleft_offxyq*2+82*2], 1 ; top-left row 1
742*c0909341SAndroid Build Coastguard Worker    punpckldq       xm1, xm2, xm0
743*c0909341SAndroid Build Coastguard Worker    punpcklwd       xm1, xm16, xm1         ; same pairing for row 1
744*c0909341SAndroid Build Coastguard Worker    mova           xm16, xm20
745*c0909341SAndroid Build Coastguard Worker    vpdpwssd       xm16, xm17, xm10        ; horizontal blends for row 0
746*c0909341SAndroid Build Coastguard Worker    mova           xm17, xm20
747*c0909341SAndroid Build Coastguard Worker    vpdpwssd       xm17, xm1, xm10         ; horizontal blends for row 1
748*c0909341SAndroid Build Coastguard Worker    punpckhwd        m1, m19, m18          ; row 0: (top, cur) order
749*c0909341SAndroid Build Coastguard Worker    punpcklwd       m19, m18
750*c0909341SAndroid Build Coastguard Worker    punpckhwd       m18, m2, m0            ; row 1: (cur, top) order
751*c0909341SAndroid Build Coastguard Worker    punpcklwd        m2, m0
752*c0909341SAndroid Build Coastguard Worker    psrad          xm16, 1
753*c0909341SAndroid Build Coastguard Worker    psrad          xm17, 1
754*c0909341SAndroid Build Coastguard Worker    packssdw       xm16, xm17
755*c0909341SAndroid Build Coastguard Worker    vpsravw        xm16, xm21
756*c0909341SAndroid Build Coastguard Worker    vpshuflw    m19{k4}, m16, q1302        ; patch the horizontally blended values into the interleaved row 0 vector
757*c0909341SAndroid Build Coastguard Worker    punpckhqdq     xm16, xm16
758*c0909341SAndroid Build Coastguard Worker    vpshuflw     m2{k4}, m16, q3120        ; same for the interleaved row 1 vector
759*c0909341SAndroid Build Coastguard Worker%endif
760*c0909341SAndroid Build Coastguard Worker    call %%add_noise_v
761*c0909341SAndroid Build Coastguard Worker    sub              hb, 2<<%2
762*c0909341SAndroid Build Coastguard Worker    jg %%loop_y_h_overlap                  ; remaining rows only need the horizontal blend
763*c0909341SAndroid Build Coastguard Worker    add              wq, 32>>%2
764*c0909341SAndroid Build Coastguard Worker    jge .end
765*c0909341SAndroid Build Coastguard Worker    mov            srcq, r9mp
766*c0909341SAndroid Build Coastguard Worker    mov            dstq, r10mp
767*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r11mp
768*c0909341SAndroid Build Coastguard Worker    lea            srcq, [srcq+wq*2]
769*c0909341SAndroid Build Coastguard Worker    lea            dstq, [dstq+wq*2]
770*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+wq*(2<<%2)]
771*c0909341SAndroid Build Coastguard Worker    jmp %%hv_overlap
772*c0909341SAndroid Build Coastguard Worker
773*c0909341SAndroid Build Coastguard WorkerALIGN function_align
774*c0909341SAndroid Build Coastguard Worker%%add_noise_v:
    ; Vertical overlap blend. Takes the interleaved word pairs prepared by the
    ; caller, forms m20 + dot(pair, m11) per pair, shifts right by 1, packs to
    ; words and applies the per-word arithmetic shift in m21. The blended grain
    ; ends up in m18 (and m19 in the non-subsampled case). There is no ret:
    ; execution falls through into %%add_noise.
775*c0909341SAndroid Build Coastguard Worker%if %3
776*c0909341SAndroid Build Coastguard Worker    mova           ym16, ym20
777*c0909341SAndroid Build Coastguard Worker    vpdpwssd       ym16, ym17, ym11
778*c0909341SAndroid Build Coastguard Worker    mova           ym17, ym20
779*c0909341SAndroid Build Coastguard Worker    vpdpwssd       ym17, ym1, ym11
780*c0909341SAndroid Build Coastguard Worker    psrad          ym16, 1
781*c0909341SAndroid Build Coastguard Worker    psrad          ym17, 1
782*c0909341SAndroid Build Coastguard Worker    packssdw       ym16, ym17
783*c0909341SAndroid Build Coastguard Worker    vpsravw     m18{k1}, m16, m21          ; NOTE(review): k1 is set up outside this view; only the words it selects are replaced
784*c0909341SAndroid Build Coastguard Worker%elif %2
785*c0909341SAndroid Build Coastguard Worker    mova            m18, m20
786*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m18, m16, m11
787*c0909341SAndroid Build Coastguard Worker    mova            m16, m20
788*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m16, m17, m11
789*c0909341SAndroid Build Coastguard Worker    psrad           m18, 1
790*c0909341SAndroid Build Coastguard Worker    psrad           m16, 1
791*c0909341SAndroid Build Coastguard Worker    packssdw        m18, m16               ; m18 = blended rows 0-1, overwriting the unblended copy
792*c0909341SAndroid Build Coastguard Worker    vpsravw         m18, m21
793*c0909341SAndroid Build Coastguard Worker%else
794*c0909341SAndroid Build Coastguard Worker    mova            m16, m20
795*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m16, m1, m11           ; row 0, high interleave
796*c0909341SAndroid Build Coastguard Worker    mova            m17, m20
797*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m17, m18, m11          ; row 1, high interleave
798*c0909341SAndroid Build Coastguard Worker    mova            m18, m20
799*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m18, m19, m11          ; row 0, low interleave
800*c0909341SAndroid Build Coastguard Worker    mova            m19, m20
801*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m19, m2, m11           ; row 1, low interleave
802*c0909341SAndroid Build Coastguard Worker    REPX   {psrad x, 1}, m16, m17, m18, m19
803*c0909341SAndroid Build Coastguard Worker    packssdw        m18, m16               ; m18 = blended row 0
804*c0909341SAndroid Build Coastguard Worker    packssdw        m19, m17               ; m19 = blended row 1
805*c0909341SAndroid Build Coastguard Worker    vpsravw         m18, m21
806*c0909341SAndroid Build Coastguard Worker    vpsravw         m19, m21
807*c0909341SAndroid Build Coastguard Worker%endif
808*c0909341SAndroid Build Coastguard Worker%%add_noise:
809*c0909341SAndroid Build Coastguard Worker%if %2
810*c0909341SAndroid Build Coastguard Worker    mova             m2, [lumaq+lstrideq*(0<<%3)]
811*c0909341SAndroid Build Coastguard Worker    mova             m0, [lumaq+lstrideq*(1<<%3)]
812*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
813*c0909341SAndroid Build Coastguard Worker    mova             m3, [lumaq+lstrideq*(0<<%3)]
814*c0909341SAndroid Build Coastguard Worker    mova             m1, [lumaq+lstrideq*(1<<%3)]
815*c0909341SAndroid Build Coastguard Worker    mova            m16, m12
816*c0909341SAndroid Build Coastguard Worker    vpermi2w        m16, m2, m0
817*c0909341SAndroid Build Coastguard Worker    vpermt2w         m2, m13, m0
818*c0909341SAndroid Build Coastguard Worker    mova            m17, m12
819*c0909341SAndroid Build Coastguard Worker    vpermi2w        m17, m3, m1
820*c0909341SAndroid Build Coastguard Worker    vpermt2w         m3, m13, m1
821*c0909341SAndroid Build Coastguard Worker    pavgw            m2, m16
822*c0909341SAndroid Build Coastguard Worker    pavgw            m3, m17
823*c0909341SAndroid Build Coastguard Worker%elif %1
824*c0909341SAndroid Build Coastguard Worker    mova             m2, [lumaq+lstrideq*0]
825*c0909341SAndroid Build Coastguard Worker    mova             m3, [lumaq+lstrideq*1]
826*c0909341SAndroid Build Coastguard Worker%endif
827*c0909341SAndroid Build Coastguard Worker%if %2
828*c0909341SAndroid Build Coastguard Worker    mova           ym16, [srcq+strideq*0]
829*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m16, [srcq+strideq*1], 1
830*c0909341SAndroid Build Coastguard Worker    lea            srcq, [srcq+strideq*2]
831*c0909341SAndroid Build Coastguard Worker%else
832*c0909341SAndroid Build Coastguard Worker    mova            m16, [srcq+strideq*0]
833*c0909341SAndroid Build Coastguard Worker%endif
834*c0909341SAndroid Build Coastguard Worker%if %1
835*c0909341SAndroid Build Coastguard Worker    punpckhwd       m17, m2, m16
836*c0909341SAndroid Build Coastguard Worker    mova             m0, m14
837*c0909341SAndroid Build Coastguard Worker    vpdpwssd         m0, m17, m15
838*c0909341SAndroid Build Coastguard Worker    punpcklwd       m17, m2, m16
839*c0909341SAndroid Build Coastguard Worker    mova             m2, m14
840*c0909341SAndroid Build Coastguard Worker    vpdpwssd         m2, m17, m15
841*c0909341SAndroid Build Coastguard Worker%endif
842*c0909341SAndroid Build Coastguard Worker%if %2
843*c0909341SAndroid Build Coastguard Worker    mova           ym17, [srcq+strideq*0]
844*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m17, [srcq+strideq*1], 1
845*c0909341SAndroid Build Coastguard Worker%else
846*c0909341SAndroid Build Coastguard Worker    mova            m17, [srcq+strideq*1]
847*c0909341SAndroid Build Coastguard Worker%endif
848*c0909341SAndroid Build Coastguard Worker%if %1
849*c0909341SAndroid Build Coastguard Worker    psrad            m0, 6
850*c0909341SAndroid Build Coastguard Worker    psrad            m2, 6
851*c0909341SAndroid Build Coastguard Worker    packusdw         m2, m0
852*c0909341SAndroid Build Coastguard Worker    punpckhwd        m0, m3, m17
853*c0909341SAndroid Build Coastguard Worker    mova             m1, m14
854*c0909341SAndroid Build Coastguard Worker    vpdpwssd         m1, m15, m0
855*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m3, m17
856*c0909341SAndroid Build Coastguard Worker    mova             m3, m14
857*c0909341SAndroid Build Coastguard Worker    vpdpwssd         m3, m15, m0
858*c0909341SAndroid Build Coastguard Worker    psrad            m1, 6
859*c0909341SAndroid Build Coastguard Worker    psrad            m3, 6
860*c0909341SAndroid Build Coastguard Worker    packusdw         m3, m1
861*c0909341SAndroid Build Coastguard Worker    pminuw           m2, m4
862*c0909341SAndroid Build Coastguard Worker    pminuw           m3, m4
863*c0909341SAndroid Build Coastguard Worker
864*c0909341SAndroid Build Coastguard Worker.add_noise_main:
865*c0909341SAndroid Build Coastguard Worker    ; scaling[luma_src]
866*c0909341SAndroid Build Coastguard Worker    kmovw            k5, k1
867*c0909341SAndroid Build Coastguard Worker    pand             m1, m5, m2
868*c0909341SAndroid Build Coastguard Worker    vpgatherdd   m0{k5}, [scalingq+m1]
869*c0909341SAndroid Build Coastguard Worker    kmovw            k5, k1
870*c0909341SAndroid Build Coastguard Worker    psrld            m2, 16
871*c0909341SAndroid Build Coastguard Worker    vpgatherdd   m1{k5}, [scalingq+m2]
872*c0909341SAndroid Build Coastguard Worker    vpshufb      m0{k2}, m1, m6
873*c0909341SAndroid Build Coastguard Worker    kmovw            k5, k1
874*c0909341SAndroid Build Coastguard Worker    psrld            m1, m3, 16
875*c0909341SAndroid Build Coastguard Worker    vpgatherdd   m2{k5}, [scalingq+m1]
876*c0909341SAndroid Build Coastguard Worker    kmovw            k5, k1
877*c0909341SAndroid Build Coastguard Worker    pand             m3, m5
878*c0909341SAndroid Build Coastguard Worker    vpgatherdd   m1{k5}, [scalingq+m3]
879*c0909341SAndroid Build Coastguard Worker    vpshufb      m1{k2}, m2, m6
880*c0909341SAndroid Build Coastguard Worker
881*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
882*c0909341SAndroid Build Coastguard Worker    vpsllvw          m0, m7
883*c0909341SAndroid Build Coastguard Worker    vpsllvw          m1, m7
884*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m18, m0
885*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m19, m1
886*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82*(4<<%2)
887*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
888*c0909341SAndroid Build Coastguard Worker    lea            srcq, [srcq+strideq*2]
889*c0909341SAndroid Build Coastguard Worker    paddw           m16, m18
890*c0909341SAndroid Build Coastguard Worker    paddw           m17, m19
891*c0909341SAndroid Build Coastguard Worker    pmaxsw          m16, m8
892*c0909341SAndroid Build Coastguard Worker    pmaxsw          m17, m8
893*c0909341SAndroid Build Coastguard Worker    pminsw          m16, m9
894*c0909341SAndroid Build Coastguard Worker    pminsw          m17, m9
895*c0909341SAndroid Build Coastguard Worker%if %2
896*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym16
897*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m16, 1
898*c0909341SAndroid Build Coastguard Worker    lea            dstq, [dstq+strideq*2]
899*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym17
900*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m17, 1
901*c0909341SAndroid Build Coastguard Worker%else
902*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0], m16
903*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1], m17
904*c0909341SAndroid Build Coastguard Worker%endif
905*c0909341SAndroid Build Coastguard Worker    lea            dstq, [dstq+strideq*2]
906*c0909341SAndroid Build Coastguard Worker    ret
907*c0909341SAndroid Build Coastguard Worker%else
908*c0909341SAndroid Build Coastguard Worker%if %2
909*c0909341SAndroid Build Coastguard Worker    pand             m2, m4
910*c0909341SAndroid Build Coastguard Worker    pand             m3, m4
911*c0909341SAndroid Build Coastguard Worker%else
912*c0909341SAndroid Build Coastguard Worker    pand             m2, m4, [lumaq+lstrideq*0]
913*c0909341SAndroid Build Coastguard Worker    pand             m3, m4, [lumaq+lstrideq*1]
914*c0909341SAndroid Build Coastguard Worker%endif
915*c0909341SAndroid Build Coastguard Worker    jmp .add_noise_main
916*c0909341SAndroid Build Coastguard Worker%endif
917*c0909341SAndroid Build Coastguard Worker%endmacro
918*c0909341SAndroid Build Coastguard Worker
919*c0909341SAndroid Build Coastguard Worker    %%FGUV_32x32xN_LOOP 1, %2, %3
920*c0909341SAndroid Build Coastguard Worker.csfl:
921*c0909341SAndroid Build Coastguard Worker    %%FGUV_32x32xN_LOOP 0, %2, %3
922*c0909341SAndroid Build Coastguard Worker.end:
923*c0909341SAndroid Build Coastguard Worker    RET
924*c0909341SAndroid Build Coastguard Worker%endmacro
925*c0909341SAndroid Build Coastguard Worker
; Instantiate the chroma film-grain code once per argument set.
; The macro tail visible just above forwards its 2nd and 3rd arguments to the
; inner row-loop macro as that macro's %2 and %3, where (per the visible body):
;   %2 != 0 : source/destination rows are moved as 32-byte halves (two rows per
;             zmm register) and luma is reduced with pavgw over two permuted
;             copies of each row pair
;   %3      : luma row offsets are scaled by (1 << %3)
; NOTE(review): the 1st argument (420/422/444) is presumably the chroma layout
; used to name the emitted function, and the 2nd/3rd presumably the horizontal/
; vertical subsampling flags -- the FGUV_FN header is outside this view; confirm.
926*c0909341SAndroid Build Coastguard WorkerFGUV_FN 420, 1, 1
927*c0909341SAndroid Build Coastguard WorkerFGUV_FN 422, 1, 0
928*c0909341SAndroid Build Coastguard WorkerFGUV_FN 444, 0, 0
929*c0909341SAndroid Build Coastguard Worker
930*c0909341SAndroid Build Coastguard Worker%endif
931