xref: /aosp_15_r20/external/libdav1d/src/x86/filmgrain_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2022, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2022, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker%include "x86/filmgrain_common.asm"
29*c0909341SAndroid Build Coastguard Worker
30*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
31*c0909341SAndroid Build Coastguard Worker
; Read-only constants for the AVX-512 film grain kernels. The section is
; declared with 64-byte alignment; pb_even and pb_odd are each exactly 64 bytes
; and are loaded whole with mova further down.
32*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 64
33*c0909341SAndroid Build Coastguard Worker
; pb_even / pb_odd: the byte values 0,2,...,126 and 1,3,...,127. The chroma
; setup loads them into m11 / m12; their use site is outside the visible part
; of the file (presumably byte-permute indices selecting even/odd luma
; samples - confirm against the chroma loop).
34*c0909341SAndroid Build Coastguard Workerpb_even:       db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
35*c0909341SAndroid Build Coastguard Worker               db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
36*c0909341SAndroid Build Coastguard Worker               db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
37*c0909341SAndroid Build Coastguard Worker               db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
38*c0909341SAndroid Build Coastguard Workerpb_odd:        db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
39*c0909341SAndroid Build Coastguard Worker               db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
40*c0909341SAndroid Build Coastguard Worker               db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
41*c0909341SAndroid Build Coastguard Worker               db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
; interleave_hl: pshufb control (broadcast to every 128-bit lane) that reorders
; 16 bytes as 8,0,9,1,...,15,7. Applied to the per-pixel scaling bytes so that
; one shuffled register can feed both pmaddubsw calls in .add_noise_h: the odd
; byte of each pair carries pixels 0-7, the even byte carries pixels 8-15.
42*c0909341SAndroid Build Coastguard Workerinterleave_hl: db  8,  0,  9,  1, 10,  2, 11,  3, 12,  4, 13,  5, 14,  6, 15,  7
; Horizontal overlap weights, consumed by pmaddubsw on (left, cur) byte pairs
; and followed by pmulhrsw with pw_1024. Pairs of (0, 32) pass the current
; grain value through unchanged, since (32*cur + 16) >> 5 == cur.
;   pb_27_17_17_27: two blended columns (27/17, then 17/27)
;   pb_23_22_0_32:  one blended column (23/22); selected by the chroma setup
;                   when its ss_hor macro argument is non-zero
43*c0909341SAndroid Build Coastguard Workerpb_27_17_17_27:        db 27, 17, 17, 27,  0, 32,  0, 32
44*c0909341SAndroid Build Coastguard Workerpb_23_22_0_32:         db 23, 22,  0, 32,  0, 32,  0, 32
; Vertical overlap weights for (top, cur) byte pairs. fgy uses pb_27_17 for the
; first row of a row pair and pb_17_27 for the second. pb_23_22 is not
; referenced in the visible part of the file.
45*c0909341SAndroid Build Coastguard Workerpb_27_17:      times 2 db 27, 17
46*c0909341SAndroid Build Coastguard Workerpb_23_22:      times 2 db 23, 22
; pw_8: broadcast and used as a pshufb control by the chroma path.
47*c0909341SAndroid Build Coastguard Workerpw_8:          times 2 dw 8
; pw_1024: pmulhrsw multiplier; pmulhrsw(x, 1024) == (x + 16) >> 5.
48*c0909341SAndroid Build Coastguard Workerpw_1024:       times 2 dw 1024
49*c0909341SAndroid Build Coastguard Workerpb_17_27:      times 2 db 17, 27
; Output clamp bounds, four bytes per entry so that one vpbroadcastd fills a
; register. fgy indexes fg_max with clip_to_restricted_range*8 (255 or 235) and
; fg_min with clip_to_restricted_range*4 (0 or 16). The chroma setup indexes
; fg_max with (clip << is_id)*4, which can also select 240.
50*c0909341SAndroid Build Coastguard Workerfg_max:        times 4 db 255
51*c0909341SAndroid Build Coastguard Worker               times 4 db 240
52*c0909341SAndroid Build Coastguard Worker               times 4 db 235
; fg_min also serves as the anchor of the `base` addressing scheme
; (base = r11 - fg_min, with r11 = address of fg_min).
53*c0909341SAndroid Build Coastguard Workerfg_min:        times 4 db 0
54*c0909341SAndroid Build Coastguard Worker               times 4 db 16
; noise_rnd: pmulhrsw multipliers 1 << (15 - scaling_shift), indexed as
; noise_rnd + scaling_shift*4 - 32, so the four entries cover scaling_shift
; 8..11. pmulhrsw(x, 1 << (15 - s)) == (x + (1 << (s - 1))) >> s.
55*c0909341SAndroid Build Coastguard Workernoise_rnd:     times 2 dw 128
56*c0909341SAndroid Build Coastguard Worker               times 2 dw 64
57*c0909341SAndroid Build Coastguard Worker               times 2 dw 32
58*c0909341SAndroid Build Coastguard Worker               times 2 dw 16
59*c0909341SAndroid Build Coastguard Worker
60*c0909341SAndroid Build Coastguard WorkerSECTION .text
61*c0909341SAndroid Build Coastguard Worker
; -----------------------------------------------------------------------------
; fgy_32x32xn_8bpc: add film grain to 8-bit luma. The picture row band is
; processed in columns of 32 pixels (wq advances by 32); inside a column two
; rows are handled per iteration, row 0 in the low 256 bits of a zmm register
; and row 1 in the high 256 bits.
;
; Arguments named by cglobal: dst, src, stride, fg_data, w, scaling, grain_lut,
; h, sby. Only the first six are requested in registers; grain_lut, h and sby
; are fetched from their argument slots (grain_lutmp, hm, sbym).
;
; Persistent state after the setup block:
;   m0-m3   256-byte scaling LUT, 4 x 64 bytes from [scalingq] (aligned loads)
;   m4      interleave_hl shuffle, broadcast to every 128-bit lane
;   m5      zero
;   m6      noise_rnd entry for scaling_shift (pmulhrsw multiplier)
;   m7, m8  output clamp minimum / maximum
;   m9      pw_1024, pmulhrsw(x, 1024) == (x + 16) >> 5
;   m10     horizontal overlap weights (pb_27_17_17_27)
;   m12     vertical overlap weights: 27,17 for row 0 and 17,27 for row 1
;   k1      0x0000000f0000000f: as a byte mask, bytes 0-3 and 32-35, i.e. the
;           first four pixels of row 0 and of row 1
;   m16-m21 scratch
;
; Four code paths, selected by overlap_flag, sby and the column position:
;   .loop_x / .loop_y                      no blending
;   .loop_x_h_overlap / .loop_y_h_overlap  blend with the previous column
;   .v_overlap                             blend with the row band above
;                                          (first column, first two rows)
;   .hv_overlap                            both (later columns, first two rows)
; DEFINE_ARGS is re-issued throughout to rename the same GPRs as their
; contents change; comments below note where a rename reuses a live register.
; -----------------------------------------------------------------------------
62*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
63*c0909341SAndroid Build Coastguard Workercglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \
64*c0909341SAndroid Build Coastguard Worker                                     grain_lut, h, sby, see, overlap
; All constants are addressed as [base+symbol] relative to r11 = &fg_min.
65*c0909341SAndroid Build Coastguard Worker%define base r11-fg_min
66*c0909341SAndroid Build Coastguard Worker    lea             r11, [fg_min]
67*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.scaling_shift]
68*c0909341SAndroid Build Coastguard Worker    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
69*c0909341SAndroid Build Coastguard Worker    mov            sbyd, sbym
70*c0909341SAndroid Build Coastguard Worker    mov        overlapd, [fg_dataq+FGData.overlap_flag]
71*c0909341SAndroid Build Coastguard Worker    mov             r12, 0x0000000f0000000f ; h_overlap mask
72*c0909341SAndroid Build Coastguard Worker    mova             m0, [scalingq+64*0]
73*c0909341SAndroid Build Coastguard Worker    mova             m1, [scalingq+64*1]
74*c0909341SAndroid Build Coastguard Worker    mova             m2, [scalingq+64*2]
75*c0909341SAndroid Build Coastguard Worker    mova             m3, [scalingq+64*3]
76*c0909341SAndroid Build Coastguard Worker    kmovq            k1, r12
77*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4  m4, [base+interleave_hl]
78*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   ym16, [base+pb_27_17]
79*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m12, [base+pb_17_27]
    ; r6 = scaling_shift; the -32 bias makes scaling_shift == 8 select the
    ; first noise_rnd entry.
80*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m6, [base+noise_rnd+r6*4-32]
    ; r6b = (sby != 0); r6 is free again once m6 has been loaded.
81*c0909341SAndroid Build Coastguard Worker    test           sbyd, sbyd
82*c0909341SAndroid Build Coastguard Worker    setnz           r6b
83*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m7, [base+fg_min+r7*4]
84*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m8, [base+fg_max+r7*8]
85*c0909341SAndroid Build Coastguard Worker    pxor             m5, m5
86*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m9, [base+pw_1024]
87*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m10, [base+pb_27_17_17_27]
    ; Same k1, but vmovdqa64 masks per qword and only reads the low 8 mask
    ; bits (0x0f): qwords 0-3, i.e. the low 256 bits (row 0), take 27,17 from
    ; m16 while the high 256 bits (row 1) keep 17,27.
88*c0909341SAndroid Build Coastguard Worker    vmovdqa64   m12{k1}, m16
    ; Vertical overlap is taken only when sby != 0 and the overlap flag has
    ; its low bit set.
89*c0909341SAndroid Build Coastguard Worker    test            r6b, overlapb
90*c0909341SAndroid Build Coastguard Worker    jnz .v_overlap
91*c0909341SAndroid Build Coastguard Worker
    ; Per-row-band seed. After the rotate and 16-bit zero-extension the value
    ; is (low byte of sby*37+178) << 8 | (low byte of sby*173+105), as long as
    ; sby*37+178 does not carry into bit 24; it is then XORed with
    ; FGData.seed.
92*c0909341SAndroid Build Coastguard Worker    imul           seed, sbyd, (173 << 24) | 37
93*c0909341SAndroid Build Coastguard Worker    add            seed, (105 << 24) | 178
94*c0909341SAndroid Build Coastguard Worker    rorx           seed, seed, 24
95*c0909341SAndroid Build Coastguard Worker    movzx          seed, seew
96*c0909341SAndroid Build Coastguard Worker    xor            seed, [fg_dataq+FGData.seed]
97*c0909341SAndroid Build Coastguard Worker
    ; fg_data (r3) and scaling/grain_lut (r5/r6) are no longer needed as
    ; pointers here; their registers become src_bak, offx and offy.
98*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
99*c0909341SAndroid Build Coastguard Worker                h, sby, see, overlap
100*c0909341SAndroid Build Coastguard Worker
    ; src_bak = end of the source row and w = -w, so src_bak+w walks the
    ; columns and the column loop ends when w reaches zero. dst is turned into
    ; the (dst - src) difference so that [dstq+srcq] addresses the output.
101*c0909341SAndroid Build Coastguard Worker    lea        src_bakq, [srcq+wq]
102*c0909341SAndroid Build Coastguard Worker    neg              wq
103*c0909341SAndroid Build Coastguard Worker    sub            dstq, srcq
104*c0909341SAndroid Build Coastguard Worker.loop_x:
    ; 16-bit LFSR step. OR-ing with 0xeff4 forces every bit to 1 except bits
    ; 0, 1, 3 and 12, so the low byte of (seeb & seeh) has four constant ones
    ; plus those four seed bits; PF (even parity) is therefore set exactly
    ; when bit0^bit1^bit3^bit12 == 0. r6 holds seed >> 1 (captured before the
    ; OR); lea leaves the flags intact and provides the "feedback bit = 1"
    ; variant, cmovp selects the "feedback bit = 0" variant.
105*c0909341SAndroid Build Coastguard Worker    rorx             r6, seeq, 1
106*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4
107*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
108*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
109*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d                 ; updated seed
    ; offy = (seed >> 8) & 0xf, offx = seed >> 12 (seed is 16 bits wide).
    ; 164 = 2*82, where 82 is the grain_lut row pitch in bytes (see the
    ; "add grain_lutq, 82*2" per row pair below); 829 = 10*82 + 9.
110*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
111*c0909341SAndroid Build Coastguard Worker    rorx          offxq, seeq, 12
112*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
113*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
114*c0909341SAndroid Build Coastguard Worker    lea           offxd, [offyq+offxq*2+829] ; offy*stride+offx
115*c0909341SAndroid Build Coastguard Worker
116*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
117*c0909341SAndroid Build Coastguard Worker                h, sby, see, overlap
118*c0909341SAndroid Build Coastguard Worker
    ; grain_lut and h are reloaded from their argument slots for every column
    ; because both registers are consumed by the row loop.
119*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
120*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
121*c0909341SAndroid Build Coastguard Worker.loop_y:
    ; m21 = 32 grain bytes of row 0 (low half) and of row 1 (high half).
122*c0909341SAndroid Build Coastguard Worker    movu           ym21, [grain_lutq+offxyq-82]
123*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m21, [grain_lutq+offxyq+ 0], 1
124*c0909341SAndroid Build Coastguard Worker    call .add_noise
    ; The row counter lives in the low byte of h only; two rows per pass.
125*c0909341SAndroid Build Coastguard Worker    sub              hb, 2
126*c0909341SAndroid Build Coastguard Worker    jg .loop_y
127*c0909341SAndroid Build Coastguard Worker    add              wq, 32
128*c0909341SAndroid Build Coastguard Worker    jge .end
129*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq]
    ; Choose the path for the next column: no overlap, h-only, or h+v.
130*c0909341SAndroid Build Coastguard Worker    test       overlapd, overlapd
131*c0909341SAndroid Build Coastguard Worker    jz .loop_x
132*c0909341SAndroid Build Coastguard Worker    test           sbyd, sbyd
133*c0909341SAndroid Build Coastguard Worker    jnz .hv_overlap
134*c0909341SAndroid Build Coastguard Worker
135*c0909341SAndroid Build Coastguard Worker.loop_x_h_overlap:
    ; Same LFSR step as in .loop_x.
136*c0909341SAndroid Build Coastguard Worker    rorx             r6, seeq, 1
137*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4
138*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
139*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
140*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d                 ; updated seed
141*c0909341SAndroid Build Coastguard Worker
    ; The register that held `overlap` is renamed left_offxy: overlap is known
    ; to be non-zero on this path and is not tested again.
142*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
143*c0909341SAndroid Build Coastguard Worker                h, sby, see, left_offxy
144*c0909341SAndroid Build Coastguard Worker
    ; offxd is the register that was named offxy for the previous column, so
    ; it still holds that column's combined offset when it is saved here.
145*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
146*c0909341SAndroid Build Coastguard Worker    mov     left_offxyd, offxd               ; previous column's offy*stride+offx
147*c0909341SAndroid Build Coastguard Worker    rorx          offxq, seeq, 12
148*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
149*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
150*c0909341SAndroid Build Coastguard Worker    lea           offxd, [offyq+offxq*2+829] ; offy*stride+offx
151*c0909341SAndroid Build Coastguard Worker
152*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
153*c0909341SAndroid Build Coastguard Worker                h, sby, see, left_offxy
154*c0909341SAndroid Build Coastguard Worker
155*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
156*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
157*c0909341SAndroid Build Coastguard Worker.loop_y_h_overlap:
    ; m20 = current grain, rows 0/1 in the low/high 256 bits.
158*c0909341SAndroid Build Coastguard Worker    movu           ym20, [grain_lutq+offxyq-82]
159*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m20, [grain_lutq+offxyq+ 0], 1
    ; m19 = grain of the previous column's block, 32 bytes further right
    ; (-82+32 = -50 for row 0, +32 for row 1), placed in 128-bit lanes 0 and 2
    ; so that it lines up with the first lane of each row in m20.
160*c0909341SAndroid Build Coastguard Worker    movd           xm19, [grain_lutq+left_offxyq-50]
161*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m19, [grain_lutq+left_offxyq+32], 2
    ; (left, cur) byte pairs * m10 weights, then (x + 16) >> 5 via pw_1024.
162*c0909341SAndroid Build Coastguard Worker    punpcklbw       m19, m20
163*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m19, m10, m19
164*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m19, m9
    ; High 8 bytes of each lane are unaffected by the blend; expand them to
    ; (g, 0) pairs before m20 is modified.
165*c0909341SAndroid Build Coastguard Worker    punpckhbw       m21, m20, m5
    ; Merge the blended values into bytes 0-3 of row 0 and row 1 only (k1).
166*c0909341SAndroid Build Coastguard Worker    packsswb    m20{k1}, m19, m19
    ; Low 8 bytes of each lane as (0, g) pairs, the layout .add_noise_h expects.
167*c0909341SAndroid Build Coastguard Worker    punpcklbw       m20, m5, m20
168*c0909341SAndroid Build Coastguard Worker    call .add_noise_h
169*c0909341SAndroid Build Coastguard Worker    sub              hb, 2
170*c0909341SAndroid Build Coastguard Worker    jg .loop_y_h_overlap
171*c0909341SAndroid Build Coastguard Worker    add              wq, 32
172*c0909341SAndroid Build Coastguard Worker    jge .end
173*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq]
174*c0909341SAndroid Build Coastguard Worker    test           sbyd, sbyd
175*c0909341SAndroid Build Coastguard Worker    jnz .hv_overlap
176*c0909341SAndroid Build Coastguard Worker    jmp .loop_x_h_overlap
177*c0909341SAndroid Build Coastguard Worker
178*c0909341SAndroid Build Coastguard Worker.v_overlap:
179*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \
180*c0909341SAndroid Build Coastguard Worker                h, sby, see, overlap
181*c0909341SAndroid Build Coastguard Worker
    ; Build two 16-bit seeds at once: the current row band in the high half
    ; and the band above (sby-1) in the low half. The additive constants 188
    ; and 141 equal 105-173 and 178-37 modulo 256, which yields the sby-1
    ; terms without a separate subtraction. Only the low byte of sby is used.
    ; The multiply by 0x00010001 duplicates FGData.seed into both halves
    ; (assumes the stored seed fits in 16 bits - confirm against FGData).
182*c0909341SAndroid Build Coastguard Worker    movzx           r6d, sbyb
183*c0909341SAndroid Build Coastguard Worker    imul           seed, [fg_dataq+FGData.seed], 0x00010001
184*c0909341SAndroid Build Coastguard Worker    imul            r7d, r6d, 173 * 0x00010001
185*c0909341SAndroid Build Coastguard Worker    imul            r6d, 37 * 0x01000100
186*c0909341SAndroid Build Coastguard Worker    add             r7d, (105 << 16) | 188
187*c0909341SAndroid Build Coastguard Worker    add             r6d, (178 << 24) | (141 << 8)
188*c0909341SAndroid Build Coastguard Worker    and             r7d, 0x00ff00ff
189*c0909341SAndroid Build Coastguard Worker    and             r6d, 0xff00ff00
190*c0909341SAndroid Build Coastguard Worker    xor            seed, r7d
191*c0909341SAndroid Build Coastguard Worker    xor            seed, r6d     ; (cur_seed << 16) | top_seed
192*c0909341SAndroid Build Coastguard Worker
193*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
194*c0909341SAndroid Build Coastguard Worker                h, sby, see, overlap
195*c0909341SAndroid Build Coastguard Worker
196*c0909341SAndroid Build Coastguard Worker    lea        src_bakq, [srcq+wq]
197*c0909341SAndroid Build Coastguard Worker    neg              wq
198*c0909341SAndroid Build Coastguard Worker    sub            dstq, srcq
199*c0909341SAndroid Build Coastguard Worker
    ; Packed LFSR step for both seeds. setp yields 1 when the tap XOR is 0, so
    ; the feedback bits are obtained by first setting bits 0 and 16 of the
    ; saved seed (r6d) and then XOR-ing the two parity results in. The final
    ; rotate by one moves bit 16 into bit 15 (new top bit of top_seed) and
    ; bit 0 into bit 31 (new top bit of cur_seed). Bits 8-15 of r7d must be
    ; zero because "shl r7d, 16" moves them into bits 24-31 of the XOR mask.
200*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zero'ed
201*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
202*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
203*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
204*c0909341SAndroid Build Coastguard Worker    setp            r7b          ; parity of top_seed
205*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
206*c0909341SAndroid Build Coastguard Worker    shl             r7d, 16
207*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
208*c0909341SAndroid Build Coastguard Worker    setp            r7b          ; parity of cur_seed
209*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
210*c0909341SAndroid Build Coastguard Worker    xor             r7d, r6d
211*c0909341SAndroid Build Coastguard Worker    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
    ; Offsets for both seeds are computed in one go, 16 bits each. The extra
    ; 32*82 lands in the low (top) half only, moving the top block 32 grain
    ; rows further down.
212*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
213*c0909341SAndroid Build Coastguard Worker    rorx          offxd, seed, 12
214*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
215*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
216*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
217*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
218*c0909341SAndroid Build Coastguard Worker    lea           offxd, [offyq+offxq*2+0x10001*829+32*82]
219*c0909341SAndroid Build Coastguard Worker
    ; top_offxy is the twelfth name, i.e. the register that held the constant
    ; base pointer (r11); no [base+...] access follows the setup block.
220*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
221*c0909341SAndroid Build Coastguard Worker                h, sby, see, overlap, top_offxy
222*c0909341SAndroid Build Coastguard Worker
223*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
224*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
    ; Split the packed offsets: low half -> top_offxy, high half -> offxy.
225*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
226*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
    ; m19 = current grain, m21 = top grain (two rows each).
227*c0909341SAndroid Build Coastguard Worker    movu           ym19, [grain_lutq+offxyq-82]
228*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m19, [grain_lutq+offxyq+ 0], 1
229*c0909341SAndroid Build Coastguard Worker    movu           ym21, [grain_lutq+top_offxyq-82]
230*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
    ; (top, cur) byte pairs for the high / low 8 bytes of every lane.
231*c0909341SAndroid Build Coastguard Worker    punpckhbw       m20, m21, m19
232*c0909341SAndroid Build Coastguard Worker    punpcklbw       m21, m19
233*c0909341SAndroid Build Coastguard Worker    call .add_noise_v
    ; Only the first row pair is blended vertically; the remaining rows of
    ; this column continue in the plain .loop_y.
234*c0909341SAndroid Build Coastguard Worker    sub              hb, 2
235*c0909341SAndroid Build Coastguard Worker    jg .loop_y
236*c0909341SAndroid Build Coastguard Worker    add              wq, 32
237*c0909341SAndroid Build Coastguard Worker    jge .end
238*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq]
239*c0909341SAndroid Build Coastguard Worker
240*c0909341SAndroid Build Coastguard Worker    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
241*c0909341SAndroid Build Coastguard Worker    ; to .v_overlap, and instead always fall-through to h+v overlap
242*c0909341SAndroid Build Coastguard Worker.hv_overlap:
    ; NOTE(review): when this label is reached from the row loops, r7 is the
    ; h counter loaded from hm; the assumption below then relies on bits 8-15
    ; of that value being zero - confirm against the callers' h range.
243*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zero'ed
244*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
245*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
246*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
247*c0909341SAndroid Build Coastguard Worker    setp            r7b          ; parity of top_seed
248*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
249*c0909341SAndroid Build Coastguard Worker    shl             r7d, 16
250*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
251*c0909341SAndroid Build Coastguard Worker    setp            r7b          ; parity of cur_seed
252*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
253*c0909341SAndroid Build Coastguard Worker    xor             r7d, r6d
254*c0909341SAndroid Build Coastguard Worker    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
255*c0909341SAndroid Build Coastguard Worker
    ; left_offxy reuses the overlap register, top_offxy the former base
    ; pointer and topleft_offxy the register that carried the k1 constant.
256*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
257*c0909341SAndroid Build Coastguard Worker                h, sby, see, left_offxy, top_offxy, topleft_offxy
258*c0909341SAndroid Build Coastguard Worker
    ; Before computing this column's offsets, keep the previous column's top
    ; and current offsets as top-left and left.
259*c0909341SAndroid Build Coastguard Worker    mov  topleft_offxyd, top_offxyd
260*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
261*c0909341SAndroid Build Coastguard Worker    mov     left_offxyd, offxd
262*c0909341SAndroid Build Coastguard Worker    rorx          offxd, seed, 12
263*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
264*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
265*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
266*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
267*c0909341SAndroid Build Coastguard Worker    lea           offxd, [offyq+offxq*2+0x10001*829+32*82]
268*c0909341SAndroid Build Coastguard Worker
269*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
270*c0909341SAndroid Build Coastguard Worker                h, sby, see, left_offxy, top_offxy, topleft_offxy
271*c0909341SAndroid Build Coastguard Worker
272*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
273*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
274*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
275*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
    ; m19 = cur, m16 = left, m21 = top, m17 = top-left (left variants only
    ; occupy lanes 0 and 2, as in .loop_y_h_overlap).
276*c0909341SAndroid Build Coastguard Worker    movu           ym19, [grain_lutq+offxyq-82]
277*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m19, [grain_lutq+offxyq+ 0], 1
278*c0909341SAndroid Build Coastguard Worker    movd           xm16, [grain_lutq+left_offxyq-50]
279*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m16, [grain_lutq+left_offxyq+32], 2
280*c0909341SAndroid Build Coastguard Worker    movu           ym21, [grain_lutq+top_offxyq-82]
281*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
282*c0909341SAndroid Build Coastguard Worker    movd           xm17, [grain_lutq+topleft_offxyq-50]
283*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m17, [grain_lutq+topleft_offxyq+32], 2
284*c0909341SAndroid Build Coastguard Worker    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
285*c0909341SAndroid Build Coastguard Worker    punpcklbw       m16, m19
286*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m16, m10, m16
287*c0909341SAndroid Build Coastguard Worker    punpcklbw       m17, m21
288*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m17, m10, m17
    ; The high 8 bytes of each lane are not touched by the masked packs
    ; below, so their (top, cur) pairs can be formed already.
289*c0909341SAndroid Build Coastguard Worker    punpckhbw       m20, m21, m19
290*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m16, m9
291*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m17, m9
292*c0909341SAndroid Build Coastguard Worker    packsswb    m19{k1}, m16, m16
293*c0909341SAndroid Build Coastguard Worker    packsswb    m21{k1}, m17, m17
294*c0909341SAndroid Build Coastguard Worker    ; followed by v interpolation (top | cur -> cur)
295*c0909341SAndroid Build Coastguard Worker    punpcklbw       m21, m19
296*c0909341SAndroid Build Coastguard Worker    call .add_noise_v
    ; Remaining rows of this column need horizontal blending only.
297*c0909341SAndroid Build Coastguard Worker    sub              hb, 2
298*c0909341SAndroid Build Coastguard Worker    jg .loop_y_h_overlap
    ; lea does not modify flags, so jl still tests the result of the add.
299*c0909341SAndroid Build Coastguard Worker    add              wq, 32
300*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq]
301*c0909341SAndroid Build Coastguard Worker    jl .hv_overlap
302*c0909341SAndroid Build Coastguard Worker.end:
303*c0909341SAndroid Build Coastguard Worker    RET
; Local subroutines with three entry points that fall through into each other.
;   .add_noise_v  in: m21 / m20 = (top, cur) byte pairs for the low / high
;                     8 bytes of each lane; blends them with the m12 weights
;   .add_noise    in: m21 = grain bytes for two rows
;   .add_noise_h  in: m20 = (0, g) pairs for pixels 0-7 of each lane,
;                     m21 = (g, 0) pairs for pixels 8-15 of each lane
; All of them process two rows at srcq, store to [dstq+srcq], advance srcq by
; two strides and grain_lutq by two grain rows. m16-m21 and k2 are clobbered.
304*c0909341SAndroid Build Coastguard WorkerALIGN function_align
305*c0909341SAndroid Build Coastguard Worker.add_noise_v:
306*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m20, m12, m20
307*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m21, m12, m21
    ; (x + 16) >> 5, then saturate back to signed bytes in original order.
308*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m20, m9
309*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m21, m9
310*c0909341SAndroid Build Coastguard Worker    packsswb        m21, m20
311*c0909341SAndroid Build Coastguard Worker.add_noise:
    ; The low half gets the grain in the odd byte of each pair and the high
    ; half in the even byte; this matches the interleave_hl ordering of the
    ; scaling bytes (8,0,9,1,...).
312*c0909341SAndroid Build Coastguard Worker    punpcklbw       m20, m5, m21
313*c0909341SAndroid Build Coastguard Worker    punpckhbw       m21, m5
314*c0909341SAndroid Build Coastguard Worker.add_noise_h:
    ; m18 = 32 source pixels of row 0 (low half) and row 1 (high half); mova
    ; is the aligned load form.
315*c0909341SAndroid Build Coastguard Worker    mova           ym18, [srcq+strideq*0]
316*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m18, [srcq+strideq*1], 1
    ; 256-entry byte lookup scaling[src] built from two 128-entry permutes:
    ; the pixel's top bit (k2) selects between the two results.
317*c0909341SAndroid Build Coastguard Worker    mova            m19, m0
318*c0909341SAndroid Build Coastguard Worker    punpcklbw       m16, m18, m5
319*c0909341SAndroid Build Coastguard Worker    vpermt2b        m19, m18, m1 ; scaling[  0..127]
320*c0909341SAndroid Build Coastguard Worker    vpmovb2m         k2, m18
321*c0909341SAndroid Build Coastguard Worker    punpckhbw       m17, m18, m5
322*c0909341SAndroid Build Coastguard Worker    vpermi2b        m18, m2, m3  ; scaling[128..255]
323*c0909341SAndroid Build Coastguard Worker    vmovdqu8    m19{k2}, m18     ; scaling[src]
    ; After the shuffle each byte pair is (scaling[8+i], scaling[i]); with the
    ; zero-padded grain layouts above, pmaddubsw yields scaling[i]*grain[i] in
    ; m18 and scaling[8+i]*grain[8+i] in m19.
324*c0909341SAndroid Build Coastguard Worker    pshufb          m19, m4
325*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m18, m19, m20
326*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m19, m21
    ; Advance two grain rows (82 bytes each) for the next call.
327*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82*2
    ; Rounded right shift by scaling_shift via the noise_rnd multiplier.
328*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m18, m6      ; noise
329*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m19, m6
    ; src (zero-extended to words in m16/m17) + noise, packed with unsigned
    ; saturation and clamped to [m7, m8].
330*c0909341SAndroid Build Coastguard Worker    paddw           m16, m18
331*c0909341SAndroid Build Coastguard Worker    paddw           m17, m19
332*c0909341SAndroid Build Coastguard Worker    packuswb        m16, m17
333*c0909341SAndroid Build Coastguard Worker    pmaxub          m16, m7
334*c0909341SAndroid Build Coastguard Worker    pminub          m16, m8
    ; dstq holds (dst - src), so [dstq+srcq] is the matching output address.
335*c0909341SAndroid Build Coastguard Worker    mova    [dstq+srcq], ym16
336*c0909341SAndroid Build Coastguard Worker    add            srcq, strideq
337*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+srcq], m16, 1
338*c0909341SAndroid Build Coastguard Worker    add            srcq, strideq
339*c0909341SAndroid Build Coastguard Worker    ret
340*c0909341SAndroid Build Coastguard Worker
341*c0909341SAndroid Build Coastguard Worker%macro FGUV_FN 3 ; name, ss_hor, ss_ver
342*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \
343*c0909341SAndroid Build Coastguard Worker                                             scaling, grain_lut, h, sby, luma, \
344*c0909341SAndroid Build Coastguard Worker                                             overlap, uv_pl, is_id, _, stride3
345*c0909341SAndroid Build Coastguard Worker    lea             r11, [fg_min]
346*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.scaling_shift]
347*c0909341SAndroid Build Coastguard Worker    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
348*c0909341SAndroid Build Coastguard Worker    mov             r9d, is_idm
349*c0909341SAndroid Build Coastguard Worker    mov            sbyd, sbym
350*c0909341SAndroid Build Coastguard Worker    mov        overlapd, [fg_dataq+FGData.overlap_flag]
351*c0909341SAndroid Build Coastguard Worker%if %2
352*c0909341SAndroid Build Coastguard Worker    mov             r12, 0x000f000f000f000f ; h_overlap mask
353*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m10, [base+pb_23_22_0_32]
354*c0909341SAndroid Build Coastguard Worker    lea        stride3q, [strideq*3]
355*c0909341SAndroid Build Coastguard Worker%else
356*c0909341SAndroid Build Coastguard Worker    mov             r12, 0x0000000f0000000f
357*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m10, [base+pb_27_17_17_27]
358*c0909341SAndroid Build Coastguard Worker%endif
359*c0909341SAndroid Build Coastguard Worker    mova             m0, [scalingq+64*0]
360*c0909341SAndroid Build Coastguard Worker    mova             m1, [scalingq+64*1]
361*c0909341SAndroid Build Coastguard Worker    mova             m2, [scalingq+64*2]
362*c0909341SAndroid Build Coastguard Worker    mova             m3, [scalingq+64*3]
363*c0909341SAndroid Build Coastguard Worker    kmovq            k1, r12
364*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4  m4, [base+interleave_hl]
365*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m6, [base+noise_rnd+r6*4-32]
366*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m7, [base+fg_min+r7*4]
367*c0909341SAndroid Build Coastguard Worker    shlx            r7d, r7d, r9d
368*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m8, [base+fg_max+r7*4]
369*c0909341SAndroid Build Coastguard Worker    test           sbyd, sbyd
370*c0909341SAndroid Build Coastguard Worker    setnz           r7b
371*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m9, [base+pw_1024]
372*c0909341SAndroid Build Coastguard Worker    mova            m11, [base+pb_even]
373*c0909341SAndroid Build Coastguard Worker    mova            m12, [base+pb_odd]
374*c0909341SAndroid Build Coastguard Worker    pxor             m5, m5
375*c0909341SAndroid Build Coastguard Worker    mov              r5, r10mp      ; lstride
376*c0909341SAndroid Build Coastguard Worker    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
377*c0909341SAndroid Build Coastguard Worker    jne .csfl
378*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; %%FGUV_32x32xN_LOOP not-csfl, ss_hor, ss_ver
;
; Body of the chroma film-grain routine. The enclosing macro expands it twice
; inside one function: once with not-csfl=1 and once (at .csfl) with 0.
;   %1 not-csfl  1: the scaling-table index is built from luma and chroma with
;                   uv_luma_mult/uv_mult and uv_offset
;                0: the (averaged) luma byte indexes the scaling table directly
;   %2 ss_hor    1: blocks are 16 pixels wide, 4 rows per step, and pairs of
;                   horizontally adjacent luma bytes are averaged
;   %3 ss_ver    1: luma rows are stepped two at a time
; Four column loops follow (no overlap, horizontal, vertical, both); they
; share the %%add_noise_v / %%add_noise / %%add_noise_h subroutine at the end.
; Each subroutine call handles 2<<%2 rows of 32>>%2 pixels (64 bytes).
; NOTE(review): m0-m4, m6-m10, k1 and r7b are initialised before this macro;
; the roles given for them below are inferred from how they are used here.
; ---------------------------------------------------------------------------
379*c0909341SAndroid Build Coastguard Worker%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
380*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
381*c0909341SAndroid Build Coastguard Worker                h, sby, see, overlap, uv_pl, _, _, stride3
382*c0909341SAndroid Build Coastguard Worker%if %1
    ; not-csfl only: fetch this plane's multipliers (m14) and offset (m15);
    ; uv_pl selects the 4-byte entry. r6 (grain_lut) is only scratch here.
383*c0909341SAndroid Build Coastguard Worker    mov             r6d, uv_plm
384*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m16, [base+pw_8]
385*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4]
386*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r6*4]
387*c0909341SAndroid Build Coastguard Worker    pshufb          m14, m16     ; uv_luma_mult, uv_mult
388*c0909341SAndroid Build Coastguard Worker%endif
    ; Take the vertical-overlap path when (r7b & overlapb) is non-zero.
    ; r7b comes from the prologue - presumably "sby != 0"; confirm there.
389*c0909341SAndroid Build Coastguard Worker    test            r7b, overlapb
390*c0909341SAndroid Build Coastguard Worker    jnz %%v_overlap
391*c0909341SAndroid Build Coastguard Worker
    ; Per-row seed, both 8-bit hashes from a single multiply:
    ;   seed = fg_data.seed ^ ((((sby*37+178) & 0xff) << 8) | ((sby*173+105) & 0xff))
    ; (exact as long as sby*37+178 does not carry out of the low 24 bits).
392*c0909341SAndroid Build Coastguard Worker    imul           seed, sbyd, (173 << 24) | 37
393*c0909341SAndroid Build Coastguard Worker    add            seed, (105 << 24) | 178
394*c0909341SAndroid Build Coastguard Worker    rorx           seed, seed, 24
395*c0909341SAndroid Build Coastguard Worker    movzx          seed, seew
396*c0909341SAndroid Build Coastguard Worker    xor            seed, [fg_dataq+FGData.seed]
397*c0909341SAndroid Build Coastguard Worker
398*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
399*c0909341SAndroid Build Coastguard Worker                offx, offy, see, overlap, _, _, _, stride3
400*c0909341SAndroid Build Coastguard Worker
    ; Remember the end-of-row addresses and walk columns with a negative w
    ; that counts up to zero. The src/dst ends are spilled to the r11mp/r12mp
    ; slots (r11/r12 are renamed to top_offxy/topleft_offxy in the overlap
    ; paths); the luma end stays in r13. Luma is 1+ss_hor bytes per pixel.
401*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
402*c0909341SAndroid Build Coastguard Worker    lea             r11, [srcq+wq]
403*c0909341SAndroid Build Coastguard Worker    lea             r12, [dstq+wq]
404*c0909341SAndroid Build Coastguard Worker    lea             r13, [lumaq+wq*(1+%2)]
405*c0909341SAndroid Build Coastguard Worker    mov           r11mp, r11
406*c0909341SAndroid Build Coastguard Worker    mov           r12mp, r12
407*c0909341SAndroid Build Coastguard Worker    neg              wq
408*c0909341SAndroid Build Coastguard Worker
    ; ---- column loop, no overlap ------------------------------------------
409*c0909341SAndroid Build Coastguard Worker%%loop_x:
    ; Step the 16-bit LFSR: seed = (seed >> 1) | (bit << 15) with
    ; bit = b0 ^ b1 ^ b3 ^ b12. OR-ing with 0xeff4 forces every other bit to 1,
    ; so after "test seeb, seeh" PF is set exactly when the four taps XOR to 0;
    ; lea (flag-preserving) prepares the bit=1 result, cmovp picks bit=0.
410*c0909341SAndroid Build Coastguard Worker    rorx             r6, seeq, 1
411*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4
412*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
413*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
414*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d     ; updated seed
    ; offy = (seed >> 8) & 15, offx = seed >> 12. The 64-bit rotate leaves only
    ; seed bits 12-15 in the low dword of offxq (assuming seed fits in 16
    ; bits), and the 32-bit lea keeps just that dword, so no mask is needed.
    ; 82 is the grain_lut row pitch: offy steps 2 rows (1 if ss_ver), offx
    ; steps 2 bytes (1 if ss_hor), plus a fixed row/column border offset.
415*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
416*c0909341SAndroid Build Coastguard Worker    rorx          offxq, seeq, 12
417*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
418*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
419*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
420*c0909341SAndroid Build Coastguard Worker
421*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
422*c0909341SAndroid Build Coastguard Worker                h, offxy, see, overlap, _, _, _, stride3
423*c0909341SAndroid Build Coastguard Worker
    ; r6 (grain_lut) and r7 (h) were used as scratch above: reload both.
424*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
425*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
426*c0909341SAndroid Build Coastguard Worker%%loop_y:
    ; Gather 2<<%2 grain rows of 32>>%2 bytes each into m21.
427*c0909341SAndroid Build Coastguard Worker%if %2
428*c0909341SAndroid Build Coastguard Worker    movu           xm21, [grain_lutq+offxyq+82*0]
429*c0909341SAndroid Build Coastguard Worker    vinserti128    ym21, [grain_lutq+offxyq+82*1], 1
430*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
431*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
432*c0909341SAndroid Build Coastguard Worker%else
433*c0909341SAndroid Build Coastguard Worker    movu           ym21, [grain_lutq+offxyq+82*0]
434*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
435*c0909341SAndroid Build Coastguard Worker%endif
436*c0909341SAndroid Build Coastguard Worker    call %%add_noise
437*c0909341SAndroid Build Coastguard Worker    sub              hb, 2<<%2
438*c0909341SAndroid Build Coastguard Worker    jg %%loop_y
    ; Next column: w counts up towards 0; .end belongs to the enclosing macro.
439*c0909341SAndroid Build Coastguard Worker    add              wq, 32>>%2
440*c0909341SAndroid Build Coastguard Worker    jge .end
    ; Rewind src/dst/luma to the top row of the new column.
441*c0909341SAndroid Build Coastguard Worker    mov            srcq, r11mp
442*c0909341SAndroid Build Coastguard Worker    mov            dstq, r12mp
443*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [r13+wq*(1<<%2)]
444*c0909341SAndroid Build Coastguard Worker    add            srcq, wq
445*c0909341SAndroid Build Coastguard Worker    add            dstq, wq
    ; Without overlap every column uses this loop; with overlap, later
    ; columns blend against the left block (and the top one when sby != 0).
446*c0909341SAndroid Build Coastguard Worker    test       overlapd, overlapd
447*c0909341SAndroid Build Coastguard Worker    jz %%loop_x
448*c0909341SAndroid Build Coastguard Worker    cmp       dword r8m, 0       ; sby
449*c0909341SAndroid Build Coastguard Worker    jne %%hv_overlap
450*c0909341SAndroid Build Coastguard Worker
451*c0909341SAndroid Build Coastguard Worker    ; horizontal overlap (without vertical overlap)
452*c0909341SAndroid Build Coastguard Worker%%loop_x_h_overlap:
    ; Same LFSR step as in %%loop_x.
453*c0909341SAndroid Build Coastguard Worker    rorx             r6, seeq, 1
454*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4
455*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
456*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
457*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d     ; updated seed
458*c0909341SAndroid Build Coastguard Worker
459*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
460*c0909341SAndroid Build Coastguard Worker                offx, offy, see, left_offxy, _, _, _, stride3
461*c0909341SAndroid Build Coastguard Worker
    ; offy shares its register with the previous column's offxy, so this
    ; yields the grain bytes just past the previous block's 32>>%2 columns.
462*c0909341SAndroid Build Coastguard Worker    lea     left_offxyd, [offyq+(32>>%2)]         ; previous column's offy*stride+offx
463*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
464*c0909341SAndroid Build Coastguard Worker    rorx          offxq, seeq, 12
465*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
466*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
467*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
468*c0909341SAndroid Build Coastguard Worker
469*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
470*c0909341SAndroid Build Coastguard Worker                h, offxy, see, left_offxy, _, _, _, stride3
471*c0909341SAndroid Build Coastguard Worker
472*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
473*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
474*c0909341SAndroid Build Coastguard Worker%%loop_y_h_overlap:
    ; m20 = current grain rows, m19 = the matching rows of the left block.
475*c0909341SAndroid Build Coastguard Worker%if %2
476*c0909341SAndroid Build Coastguard Worker    movu           xm20, [grain_lutq+offxyq     +82*0]
477*c0909341SAndroid Build Coastguard Worker    movd           xm19, [grain_lutq+left_offxyq+82*0]
478*c0909341SAndroid Build Coastguard Worker    vinserti32x4   ym20, [grain_lutq+offxyq     +82*1], 1
479*c0909341SAndroid Build Coastguard Worker    vinserti32x4   ym19, [grain_lutq+left_offxyq+82*1], 1
480*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m20, [grain_lutq+offxyq     +82*2], 2
481*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m19, [grain_lutq+left_offxyq+82*2], 2
482*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m20, [grain_lutq+offxyq     +82*3], 3
483*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m19, [grain_lutq+left_offxyq+82*3], 3
484*c0909341SAndroid Build Coastguard Worker%else
485*c0909341SAndroid Build Coastguard Worker    movu           ym20, [grain_lutq+offxyq     + 0]
486*c0909341SAndroid Build Coastguard Worker    movd           xm19, [grain_lutq+left_offxyq+ 0]
487*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m20, [grain_lutq+offxyq     +82], 1
488*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m19, [grain_lutq+left_offxyq+82], 2
489*c0909341SAndroid Build Coastguard Worker%endif
    ; Interleave {left, cur} bytes and take the weighted sum with m10, then
    ; round with m9 (both presumably blend weights / rounding factor from the
    ; prologue). k1 selects which leading bytes of each row take the blended
    ; value. The grain is then widened exactly as %%add_noise would do it
    ; (m20 = {0, g} pairs, m21 = {g, 0} pairs) before entering at
    ; %%add_noise_h.
490*c0909341SAndroid Build Coastguard Worker    punpcklbw       m19, m20
491*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m19, m10, m19
492*c0909341SAndroid Build Coastguard Worker    punpckhbw       m21, m20, m5
493*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m19, m9
494*c0909341SAndroid Build Coastguard Worker    vpacksswb   m20{k1}, m19, m19
495*c0909341SAndroid Build Coastguard Worker    punpcklbw       m20, m5, m20
496*c0909341SAndroid Build Coastguard Worker    call %%add_noise_h
497*c0909341SAndroid Build Coastguard Worker    sub              hb, 2<<%2
498*c0909341SAndroid Build Coastguard Worker    jg %%loop_y_h_overlap
499*c0909341SAndroid Build Coastguard Worker    add              wq, 32>>%2
500*c0909341SAndroid Build Coastguard Worker    jge .end
501*c0909341SAndroid Build Coastguard Worker    mov            srcq, r11mp
502*c0909341SAndroid Build Coastguard Worker    mov            dstq, r12mp
503*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [r13+wq*(1<<%2)]
504*c0909341SAndroid Build Coastguard Worker    add            srcq, wq
505*c0909341SAndroid Build Coastguard Worker    add            dstq, wq
506*c0909341SAndroid Build Coastguard Worker    cmp       dword r8m, 0       ; sby
507*c0909341SAndroid Build Coastguard Worker    jne %%hv_overlap
508*c0909341SAndroid Build Coastguard Worker    jmp %%loop_x_h_overlap
509*c0909341SAndroid Build Coastguard Worker
    ; ---- first column with vertical overlap -------------------------------
510*c0909341SAndroid Build Coastguard Worker%%v_overlap:
511*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
512*c0909341SAndroid Build Coastguard Worker                _, sby, see, overlap, _, _, _, stride3
513*c0909341SAndroid Build Coastguard Worker
    ; Build two row seeds at once in one dword. The high half uses the same
    ; hash as the non-overlap path (row sby); the low half uses 188 =
    ; (105-173) mod 256 and 141 = 178-37, i.e. the hash of row sby-1 (top).
    ; r7d supplies the low byte of each half, sbyd the high byte.
514*c0909341SAndroid Build Coastguard Worker    movzx          sbyd, sbyb
515*c0909341SAndroid Build Coastguard Worker    imul           seed, [fg_dataq+FGData.seed], 0x00010001
516*c0909341SAndroid Build Coastguard Worker    imul            r7d, sbyd, 173 * 0x00010001
517*c0909341SAndroid Build Coastguard Worker    imul           sbyd, 37 * 0x01000100
518*c0909341SAndroid Build Coastguard Worker    add             r7d, (105 << 16) | 188
519*c0909341SAndroid Build Coastguard Worker    add            sbyd, (178 << 24) | (141 << 8)
520*c0909341SAndroid Build Coastguard Worker    and             r7d, 0x00ff00ff
521*c0909341SAndroid Build Coastguard Worker    and            sbyd, 0xff00ff00
522*c0909341SAndroid Build Coastguard Worker    xor            seed, r7d
523*c0909341SAndroid Build Coastguard Worker    xor            seed, sbyd    ; (cur_seed << 16) | top_seed
524*c0909341SAndroid Build Coastguard Worker
    ; m13 = top/current blend weights, k3 = byte mask of the rows that are
    ; blended in %%add_noise_v: 16 bytes (one 16-pixel row) when ss_ver,
    ; 32 bytes (two 16-pixel rows) when only ss_hor. Without subsampling
    ; both 32-pixel rows are blended and the two weight pairs are merged
    ; under k1 instead.
525*c0909341SAndroid Build Coastguard Worker%if %3
526*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m13, [base+pb_23_22]
527*c0909341SAndroid Build Coastguard Worker    kxnorw           k3, k3, k3  ; v_overlap mask
528*c0909341SAndroid Build Coastguard Worker%elif %2
529*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8 m13, [base+pb_27_17]
530*c0909341SAndroid Build Coastguard Worker    kxnord           k3, k3, k3
531*c0909341SAndroid Build Coastguard Worker    pshufd          m13, m13, q0000 ; 8x27_17, 8x17_27
532*c0909341SAndroid Build Coastguard Worker%else
533*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   ym16, [base+pb_27_17]
534*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m13, [base+pb_17_27]
535*c0909341SAndroid Build Coastguard Worker    vmovdqa64   m13{k1}, m16
536*c0909341SAndroid Build Coastguard Worker%endif
537*c0909341SAndroid Build Coastguard Worker
538*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
539*c0909341SAndroid Build Coastguard Worker                offx, offy, see, overlap, top_offxy, _, _, stride3
540*c0909341SAndroid Build Coastguard Worker
    ; Same end-pointer setup as the non-overlap path; 1<<%2 equals 1+%2 for
    ; the only values (0 and 1) that ss_hor takes.
541*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
542*c0909341SAndroid Build Coastguard Worker    lea             r11, [srcq+wq]
543*c0909341SAndroid Build Coastguard Worker    lea             r12, [dstq+wq]
544*c0909341SAndroid Build Coastguard Worker    lea             r13, [lumaq+wq*(1<<%2)]
545*c0909341SAndroid Build Coastguard Worker    mov           r11mp, r11
546*c0909341SAndroid Build Coastguard Worker    mov           r12mp, r12
547*c0909341SAndroid Build Coastguard Worker    neg              wq
548*c0909341SAndroid Build Coastguard Worker
    ; Step both LFSRs in parallel. Each setp records "taps XOR to 0" for one
    ; half: the top-seed flag ends up in bit 16 of r7d, the cur-seed flag in
    ; bit 0. XOR with the saved seeds (bits 0 and 16 forced to 1) turns the
    ; flags into feedback bits, and the rotate by 1 drops each one into
    ; bit 15 of its own 16-bit half.
549*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zeroed
550*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
551*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
552*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
553*c0909341SAndroid Build Coastguard Worker    setp            r7b          ; parity of top_seed
554*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
555*c0909341SAndroid Build Coastguard Worker    shl             r7d, 16
556*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
557*c0909341SAndroid Build Coastguard Worker    setp            r7b          ; parity of cur_seed
558*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
559*c0909341SAndroid Build Coastguard Worker    xor             r7d, r6d
560*c0909341SAndroid Build Coastguard Worker    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
    ; Offsets for both halves at once. The trailing (32>>%3)*82 term is not
    ; multiplied by 0x10001, so only the low half (top_offxy) is moved
    ; 32>>%3 grain rows further down.
561*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
562*c0909341SAndroid Build Coastguard Worker    rorx          offxd, seed, 12
563*c0909341SAndroid Build Coastguard Worker    and           offyd, 0x000f000f
564*c0909341SAndroid Build Coastguard Worker    and           offxd, 0x000f000f
565*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
566*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
567*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
568*c0909341SAndroid Build Coastguard Worker
569*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
570*c0909341SAndroid Build Coastguard Worker                h, offxy, see, overlap, top_offxy, _, _, stride3
571*c0909341SAndroid Build Coastguard Worker
    ; Reload the scratch registers, then split the packed offsets.
572*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
573*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
574*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
575*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
576*c0909341SAndroid Build Coastguard Worker
    ; Only the first group of rows is blended with the top block; the
    ; remaining rows of this column continue in the plain %%loop_y below.
577*c0909341SAndroid Build Coastguard Worker%if %3
578*c0909341SAndroid Build Coastguard Worker    movu           xm18, [grain_lutq+offxyq+82*0]
579*c0909341SAndroid Build Coastguard Worker    movu           xm20, [grain_lutq+top_offxyq+82*0]
580*c0909341SAndroid Build Coastguard Worker    ; only interpolate first line, insert remaining line unmodified
581*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 ym21, [grain_lutq+offxyq+82*1]
582*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
583*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
584*c0909341SAndroid Build Coastguard Worker    punpcklbw      xm19, xm20, xm18
585*c0909341SAndroid Build Coastguard Worker    punpckhbw      xm20, xm18
586*c0909341SAndroid Build Coastguard Worker%elif %2
587*c0909341SAndroid Build Coastguard Worker    movu           xm18, [grain_lutq+offxyq+82*0]
588*c0909341SAndroid Build Coastguard Worker    vinserti128    ym18, [grain_lutq+offxyq+82*1], 1
589*c0909341SAndroid Build Coastguard Worker    movu           xm20, [grain_lutq+top_offxyq+82*0]
590*c0909341SAndroid Build Coastguard Worker    vinserti32x4   ym20, [grain_lutq+top_offxyq+82*1], 1
591*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2]
592*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
593*c0909341SAndroid Build Coastguard Worker    punpcklbw      ym19, ym20, ym18
594*c0909341SAndroid Build Coastguard Worker    punpckhbw      ym20, ym18
595*c0909341SAndroid Build Coastguard Worker%else
596*c0909341SAndroid Build Coastguard Worker    movu           ym21, [grain_lutq+offxyq+82*0]
597*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
598*c0909341SAndroid Build Coastguard Worker    movu           ym20, [grain_lutq+top_offxyq+82*0]
599*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
600*c0909341SAndroid Build Coastguard Worker%endif
601*c0909341SAndroid Build Coastguard Worker    call %%add_noise_v
602*c0909341SAndroid Build Coastguard Worker    sub              hb, 2<<%2
603*c0909341SAndroid Build Coastguard Worker    jg %%loop_y
604*c0909341SAndroid Build Coastguard Worker    add              wq, 32>>%2
605*c0909341SAndroid Build Coastguard Worker    jge .end
606*c0909341SAndroid Build Coastguard Worker    mov            srcq, r11mp
607*c0909341SAndroid Build Coastguard Worker    mov            dstq, r12mp
608*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [r13+wq*(1<<%2)]
609*c0909341SAndroid Build Coastguard Worker    add            srcq, wq
610*c0909341SAndroid Build Coastguard Worker    add            dstq, wq
611*c0909341SAndroid Build Coastguard Worker
    ; ---- columns with both horizontal and vertical overlap ----------------
    ; Reached by fall-through from %%v_overlap and by jumps from the other
    ; column loops when sby != 0.
612*c0909341SAndroid Build Coastguard Worker%%hv_overlap:
613*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zeroed
614*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
615*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
616*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
617*c0909341SAndroid Build Coastguard Worker    setp            r7b          ; parity of top_seed
618*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
619*c0909341SAndroid Build Coastguard Worker    shl             r7d, 16
620*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
621*c0909341SAndroid Build Coastguard Worker    setp            r7b          ; parity of cur_seed
622*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
623*c0909341SAndroid Build Coastguard Worker    xor             r7d, r6d
624*c0909341SAndroid Build Coastguard Worker    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
625*c0909341SAndroid Build Coastguard Worker
626*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
627*c0909341SAndroid Build Coastguard Worker                offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
628*c0909341SAndroid Build Coastguard Worker
    ; Derive the left and top-left origins from the previous column's offsets
    ; (still held in the offy and top_offxy registers) before overwriting them.
629*c0909341SAndroid Build Coastguard Worker    lea  topleft_offxyd, [top_offxyq+(32>>%2)]
630*c0909341SAndroid Build Coastguard Worker    lea     left_offxyd, [offyq+(32>>%2)]
631*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
632*c0909341SAndroid Build Coastguard Worker    rorx          offxd, seed, 12
633*c0909341SAndroid Build Coastguard Worker    and           offyd, 0x000f000f
634*c0909341SAndroid Build Coastguard Worker    and           offxd, 0x000f000f
635*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
636*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
637*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
638*c0909341SAndroid Build Coastguard Worker
639*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
640*c0909341SAndroid Build Coastguard Worker                h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
641*c0909341SAndroid Build Coastguard Worker
642*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
643*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
644*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
645*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
646*c0909341SAndroid Build Coastguard Worker
    ; m21 = current rows, m16 = left rows, m20 = top rows, m18 = top-left.
    ; The horizontal blend of {left, cur} and {topleft, top} is done in one
    ; pmaddubsw by packing both pairs into the two qwords of each lane.
647*c0909341SAndroid Build Coastguard Worker%if %2
648*c0909341SAndroid Build Coastguard Worker    movu           xm21, [grain_lutq+offxyq+82*0]
649*c0909341SAndroid Build Coastguard Worker    movd           xm16, [grain_lutq+left_offxyq+82*0]
650*c0909341SAndroid Build Coastguard Worker    vinserti128    ym21, [grain_lutq+offxyq+82*1], 1
651*c0909341SAndroid Build Coastguard Worker    vinserti128    ym16, [grain_lutq+left_offxyq+82*1], 1
652*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
653*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m16, [grain_lutq+left_offxyq+82*2], 2
654*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
655*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m16, [grain_lutq+left_offxyq+82*3], 3
656*c0909341SAndroid Build Coastguard Worker    movd           xm18, [grain_lutq+topleft_offxyq+82*0]
657*c0909341SAndroid Build Coastguard Worker    movu           xm20, [grain_lutq+top_offxyq]
658*c0909341SAndroid Build Coastguard Worker    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
659*c0909341SAndroid Build Coastguard Worker    punpcklbw       m16, m21
660*c0909341SAndroid Build Coastguard Worker%if %3
661*c0909341SAndroid Build Coastguard Worker    punpcklbw      xm18, xm20
662*c0909341SAndroid Build Coastguard Worker%else
663*c0909341SAndroid Build Coastguard Worker    vinserti128    ym18, [grain_lutq+topleft_offxyq+82*1], 1
664*c0909341SAndroid Build Coastguard Worker    vinserti128    ym20, [grain_lutq+top_offxyq+82*1], 1
665*c0909341SAndroid Build Coastguard Worker    punpcklbw      ym18, ym20
666*c0909341SAndroid Build Coastguard Worker%endif
667*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m16, m18
668*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m16, m10, m16
669*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m16, m9
670*c0909341SAndroid Build Coastguard Worker    packsswb        m16, m16
    ; After the pack, bytes 0-3 of each lane hold the blended cur values and
    ; bytes 4-7 the blended top values; the 4-byte vpalignr brings the top
    ; values to the front before the masked merge into m20.
671*c0909341SAndroid Build Coastguard Worker    vmovdqu8    m21{k1}, m16
672*c0909341SAndroid Build Coastguard Worker%if %3
673*c0909341SAndroid Build Coastguard Worker    vpalignr   xm20{k1}, xm16, xm16, 4
674*c0909341SAndroid Build Coastguard Worker    punpcklbw      xm19, xm20, xm21
675*c0909341SAndroid Build Coastguard Worker    punpckhbw      xm20, xm21
676*c0909341SAndroid Build Coastguard Worker%else
677*c0909341SAndroid Build Coastguard Worker    vpalignr   ym20{k1}, ym16, ym16, 4
678*c0909341SAndroid Build Coastguard Worker    punpcklbw      ym19, ym20, ym21
679*c0909341SAndroid Build Coastguard Worker    punpckhbw      ym20, ym21
680*c0909341SAndroid Build Coastguard Worker%endif
681*c0909341SAndroid Build Coastguard Worker%else
682*c0909341SAndroid Build Coastguard Worker    movu           ym21, [grain_lutq+offxyq+82*0]
683*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
684*c0909341SAndroid Build Coastguard Worker    movd           xm16, [grain_lutq+left_offxyq+82*0]
685*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m16, [grain_lutq+left_offxyq+82*1], 2
686*c0909341SAndroid Build Coastguard Worker    movu           ym20, [grain_lutq+top_offxyq+82*0]
687*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
688*c0909341SAndroid Build Coastguard Worker    movd           xm18, [grain_lutq+topleft_offxyq+82*0]
689*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m18, [grain_lutq+topleft_offxyq+82*1], 2
690*c0909341SAndroid Build Coastguard Worker    punpcklbw       m16, m21
691*c0909341SAndroid Build Coastguard Worker    punpcklbw       m18, m20
692*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m16, m18
693*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m16, m10, m16
694*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m16, m9
695*c0909341SAndroid Build Coastguard Worker    packsswb        m16, m16
696*c0909341SAndroid Build Coastguard Worker    vpalignr    m20{k1}, m16, m16, 4
697*c0909341SAndroid Build Coastguard Worker    vmovdqu8    m21{k1}, m16
698*c0909341SAndroid Build Coastguard Worker%endif
    ; The vertical blend applies to the first row group only; the rest of the
    ; column needs just the horizontal blend, hence the jump target below.
699*c0909341SAndroid Build Coastguard Worker    call %%add_noise_v
700*c0909341SAndroid Build Coastguard Worker    sub              hb, 2<<%2
701*c0909341SAndroid Build Coastguard Worker    jg %%loop_y_h_overlap
702*c0909341SAndroid Build Coastguard Worker    add              wq, 32>>%2
703*c0909341SAndroid Build Coastguard Worker    jge .end
704*c0909341SAndroid Build Coastguard Worker    mov            srcq, r11mp
705*c0909341SAndroid Build Coastguard Worker    mov            dstq, r12mp
706*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [r13+wq*(1<<%2)]
707*c0909341SAndroid Build Coastguard Worker    add            srcq, wq
708*c0909341SAndroid Build Coastguard Worker    add            dstq, wq
709*c0909341SAndroid Build Coastguard Worker    jmp %%hv_overlap
    ; ---- shared subroutine with three entry points ------------------------
    ; %%add_noise_v: in m19/m20 = interleaved {top, cur} bytes (subsampled
    ;                cases) or m20 = top and m21 = cur rows (4:4:4 case);
    ;                blends vertically into m21, then falls through.
    ; %%add_noise:   in m21 = grain bytes; widens them, then falls through.
    ; %%add_noise_h: in m20/m21 = already widened grain.
    ; All entries advance grain_lutq, lumaq, srcq and dstq by one row group.
710*c0909341SAndroid Build Coastguard WorkerALIGN function_align
711*c0909341SAndroid Build Coastguard Worker%%add_noise_v:
712*c0909341SAndroid Build Coastguard Worker%if %3
    ; 16 bytes (one row) blended; k3 keeps the other rows of m21 untouched.
713*c0909341SAndroid Build Coastguard Worker    pmaddubsw      xm19, xm13, xm19
714*c0909341SAndroid Build Coastguard Worker    pmaddubsw      xm20, xm13, xm20
715*c0909341SAndroid Build Coastguard Worker    pmulhrsw       xm19, xm9
716*c0909341SAndroid Build Coastguard Worker    pmulhrsw       xm20, xm9
717*c0909341SAndroid Build Coastguard Worker    vpacksswb   m21{k3}, m19, m20
718*c0909341SAndroid Build Coastguard Worker%elif %2
    ; 32 bytes (two rows) blended; k3 keeps the other rows of m21 untouched.
719*c0909341SAndroid Build Coastguard Worker    pmaddubsw      ym19, ym13, ym19
720*c0909341SAndroid Build Coastguard Worker    pmaddubsw      ym20, ym13, ym20
721*c0909341SAndroid Build Coastguard Worker    pmulhrsw       ym19, ym9
722*c0909341SAndroid Build Coastguard Worker    pmulhrsw       ym20, ym9
723*c0909341SAndroid Build Coastguard Worker    vpacksswb   m21{k3}, m19, m20
724*c0909341SAndroid Build Coastguard Worker%else
    ; Both 32-byte rows are blended, so m21 is replaced completely.
725*c0909341SAndroid Build Coastguard Worker    punpcklbw       m19, m20, m21
726*c0909341SAndroid Build Coastguard Worker    punpckhbw       m20, m21
727*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m19, m13, m19
728*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m20, m13, m20
729*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m19, m9
730*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m20, m9
731*c0909341SAndroid Build Coastguard Worker    packsswb        m21, m19, m20
732*c0909341SAndroid Build Coastguard Worker%endif
733*c0909341SAndroid Build Coastguard Worker%%add_noise:
    ; Pair every grain byte with a zero byte (m5 is all-zero): {0, g} for the
    ; low halves of each lane, {g, 0} for the high halves. The later
    ; pmaddubsw against the shuffled scaling bytes then gives scaling*grain.
734*c0909341SAndroid Build Coastguard Worker    punpcklbw       m20, m5, m21
735*c0909341SAndroid Build Coastguard Worker    punpckhbw       m21, m5
736*c0909341SAndroid Build Coastguard Worker%%add_noise_h:
    ; Luma rows co-located with this row group (row step doubles with ss_ver).
    ; mova is used for luma/src loads and dst stores, so those addresses are
    ; expected to be suitably aligned.
737*c0909341SAndroid Build Coastguard Worker    mova           ym18, [lumaq+lstrideq*(0<<%3)]
738*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m18, [lumaq+lstrideq*(1<<%3)], 1
739*c0909341SAndroid Build Coastguard Worker%if %2
    ; ss_hor: two more luma rows, then gather the even (m11 = pb_even) and
    ; odd (m12 = pb_odd) bytes of all four rows and average them pairwise,
    ; interleaved with loading four 16-byte chroma rows into m17.
740*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
741*c0909341SAndroid Build Coastguard Worker    mova           ym16, [lumaq+lstrideq*(0<<%3)]
742*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m16, [lumaq+lstrideq*(1<<%3)], 1
743*c0909341SAndroid Build Coastguard Worker    mova           xm17, [srcq+strideq*0]
744*c0909341SAndroid Build Coastguard Worker    mova            m19, m11
745*c0909341SAndroid Build Coastguard Worker    vpermi2b        m19, m18, m16
746*c0909341SAndroid Build Coastguard Worker    vinserti128    ym17, [srcq+strideq*1], 1
747*c0909341SAndroid Build Coastguard Worker    vpermt2b        m18, m12, m16
748*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m17, [srcq+strideq*2], 2
749*c0909341SAndroid Build Coastguard Worker    pavgb           m18, m19
750*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m17, [srcq+stride3q ], 3
751*c0909341SAndroid Build Coastguard Worker%else
752*c0909341SAndroid Build Coastguard Worker    mova           ym17, [srcq+strideq*0]
753*c0909341SAndroid Build Coastguard Worker    vinserti32x8    m17, [srcq+strideq*1], 1
754*c0909341SAndroid Build Coastguard Worker%endif
755*c0909341SAndroid Build Coastguard Worker%if %1
    ; not-csfl: index = sat_u8(((luma*uv_luma_mult + chroma*uv_mult) >> 6)
    ;                          + uv_offset), one byte per pixel in m18.
756*c0909341SAndroid Build Coastguard Worker    punpckhbw       m19, m18, m17
757*c0909341SAndroid Build Coastguard Worker    punpcklbw       m18, m17     ; { luma, chroma }
758*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m19, m14
759*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m18, m14
760*c0909341SAndroid Build Coastguard Worker    psraw           m19, 6
761*c0909341SAndroid Build Coastguard Worker    psraw           m18, 6
762*c0909341SAndroid Build Coastguard Worker    paddw           m19, m15
763*c0909341SAndroid Build Coastguard Worker    paddw           m18, m15
764*c0909341SAndroid Build Coastguard Worker    packuswb        m18, m19
    ; Common tail, emitted only in the not-csfl expansion; the csfl expansion
    ; jumps here. Looks up scaling[m18] in the 256-byte table held in m0-m3:
    ; bit 7 of each index (k2) picks between the two 128-entry halves.
765*c0909341SAndroid Build Coastguard Worker.add_noise_main:
766*c0909341SAndroid Build Coastguard Worker    mova            m19, m0
767*c0909341SAndroid Build Coastguard Worker    vpermt2b        m19, m18, m1 ; scaling[  0..127]
768*c0909341SAndroid Build Coastguard Worker    vpmovb2m         k2, m18
769*c0909341SAndroid Build Coastguard Worker    vpermi2b        m18, m2, m3  ; scaling[128..255]
770*c0909341SAndroid Build Coastguard Worker    vmovdqu8    m19{k2}, m18     ; scaling[src]
    ; m4 presumably reorders the scaling bytes to line up with the {0, g} /
    ; {g, 0} grain pairs; m6 presumably applies the rounding scaling shift
    ; (both set up outside this view - confirm).
771*c0909341SAndroid Build Coastguard Worker    pshufb          m19, m4
772*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m18, m19, m20
773*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m19, m21
    ; Pointer updates are interleaved with the arithmetic below.
774*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82*2<<%2
775*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
776*c0909341SAndroid Build Coastguard Worker    lea            srcq, [srcq+strideq*(2<<%2)]
777*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m18, m6      ; noise
778*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m19, m6
779*c0909341SAndroid Build Coastguard Worker    punpcklbw       m16, m17, m5 ; chroma
780*c0909341SAndroid Build Coastguard Worker    punpckhbw       m17, m5
781*c0909341SAndroid Build Coastguard Worker    paddw           m16, m18
782*c0909341SAndroid Build Coastguard Worker    paddw           m17, m19
783*c0909341SAndroid Build Coastguard Worker    packuswb        m16, m17
    ; Clamp the result to [m7, m8] (presumably the legal pixel range).
784*c0909341SAndroid Build Coastguard Worker    pmaxub          m16, m7
785*c0909341SAndroid Build Coastguard Worker    pminub          m16, m8
786*c0909341SAndroid Build Coastguard Worker%if %2
787*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm16
788*c0909341SAndroid Build Coastguard Worker    vextracti128  [dstq+strideq*1], ym16, 1
789*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m16, 2
790*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m16, 3
791*c0909341SAndroid Build Coastguard Worker%else
792*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym16
793*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m16, 1
794*c0909341SAndroid Build Coastguard Worker%endif
795*c0909341SAndroid Build Coastguard Worker    lea            dstq, [dstq+strideq*(2<<%2)]
796*c0909341SAndroid Build Coastguard Worker    ret
797*c0909341SAndroid Build Coastguard Worker%else
    ; csfl: the luma bytes in m18 index the scaling table directly; reuse
    ; the tail emitted by the not-csfl expansion of this macro.
798*c0909341SAndroid Build Coastguard Worker    jmp .add_noise_main
799*c0909341SAndroid Build Coastguard Worker%endif
800*c0909341SAndroid Build Coastguard Worker%endmacro
801*c0909341SAndroid Build Coastguard Worker
802*c0909341SAndroid Build Coastguard Worker    %%FGUV_32x32xN_LOOP 1, %2, %3
803*c0909341SAndroid Build Coastguard Worker.csfl:
804*c0909341SAndroid Build Coastguard Worker    %%FGUV_32x32xN_LOOP 0, %2, %3
805*c0909341SAndroid Build Coastguard Worker.end:
806*c0909341SAndroid Build Coastguard Worker    RET
807*c0909341SAndroid Build Coastguard Worker%endmacro
808*c0909341SAndroid Build Coastguard Worker
; Instantiate the chroma film-grain function once per chroma layout.
; Arguments 2 and 3 are forwarded by FGUV_FN as ss_hor and ss_ver to
; %%FGUV_32x32xN_LOOP; argument 1 is presumably the layout suffix of the
; generated function name (the FGUV_FN header is not visible here - confirm).
809*c0909341SAndroid Build Coastguard WorkerFGUV_FN 420, 1, 1
810*c0909341SAndroid Build Coastguard WorkerFGUV_FN 422, 1, 0
811*c0909341SAndroid Build Coastguard WorkerFGUV_FN 444, 0, 0
812*c0909341SAndroid Build Coastguard Worker
813*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
814